# File: Lagent/lagent/actions/google_scholar_search.py
# Repository page residue: "Superkingjcj's picture", "Upload 111 files", commit e679d69 (verified).
# flake8: noqa: E501
import os
from typing import Optional, Type
from asyncer import asyncify
from lagent.actions.base_action import AsyncActionMixin, BaseAction, tool_api
from lagent.schema import ActionReturn, ActionStatusCode
from .parser import BaseParser, JsonParser
class GoogleScholar(BaseAction):
    """Plugin for google scholar search.

    Args:
        api_key (str): API key sent with every request made through
            ``serpapi.GoogleSearch``. The ``SERPER_API_KEY`` environment
            variable, when set, takes precedence over this argument.
        description (dict): The description of the action. Defaults to ``None``.
        parser (Type[BaseParser]): The parser class to process the
            action's inputs and outputs. Defaults to :class:`JsonParser`.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        description: Optional[dict] = None,
        parser: Type[BaseParser] = JsonParser,
    ):
        super().__init__(description, parser)
        # The environment variable wins; the explicit argument is only the
        # fallback used when SERPER_API_KEY is not set.
        # NOTE(review): requests go through the `serpapi` package, so this is
        # presumably a SerpApi key rather than a serper.dev one - confirm.
        api_key = os.environ.get('SERPER_API_KEY', api_key)
        if api_key is None:
            raise ValueError(
                'Please set Serper API key either in the environment '
                'as SERPER_API_KEY or pass it as `api_key` parameter.'
            )
        self.api_key = api_key

    @staticmethod
    def _check_response(response: dict) -> dict:
        """Return ``response`` unchanged, or raise if it reports an error.

        A response carrying a non-empty ``error`` entry is turned into a
        ``RuntimeError`` with that text, so callers report the service's own
        explanation instead of a ``KeyError`` on a missing result section.
        """
        error = response.get('error')
        if error:
            raise RuntimeError(str(error))
        return response

    @tool_api(explode_return=True)
    def search_google_scholar(
        self,
        query: str,
        cites: Optional[str] = None,
        as_ylo: Optional[int] = None,
        as_yhi: Optional[int] = None,
        scisbd: Optional[int] = None,
        cluster: Optional[str] = None,
        hl: Optional[str] = None,
        lr: Optional[str] = None,
        start: Optional[int] = None,
        num: Optional[int] = None,
        as_sdt: Optional[str] = None,
        safe: Optional[str] = None,
        filter: Optional[str] = None,
        as_vis: Optional[str] = None,
    ) -> dict:
        """Search for scholarly articles based on a query according to the google scholar.

        Args:
            query (str): The query to search for.
            cites (Optional[str]): The unique ID of an article for triggering "Cited By" searches.
            as_ylo (Optional[int]): The starting year for results (e.g., if as_ylo=2018, results before this year will be omitted).
            as_yhi (Optional[int]): The ending year for results (e.g., if as_yhi=2018, results after this year will be omitted).
            scisbd (Optional[int]): Defines articles added in the last year, sorted by date. It can be set to 1 to include only abstracts, or 2 to include everything.
            cluster (Optional[str]): The unique ID of an article for triggering "All Versions" searches.
            hl (Optional[str]): The language to use for the Google Scholar search.
            lr (Optional[str]): One or multiple languages to limit the search to.
            start (Optional[int]): The result offset for pagination (0 is the first page of results, 10 is the 2nd page, etc.)
            num (Optional[int]): The maximum number of results to return, limited to 20.
            as_sdt (Optional[str]): Can be used either as a search type or a filter.
            safe (Optional[str]): The level of filtering for adult content.
            filter (Optional[str]): Defines if the filters for 'Similar Results' and 'Omitted Results' are on or off.
            as_vis (Optional[str]): Defines whether to include citations or not.

        Returns:
            :class:`dict`: article information
                - title: a list of the titles of the selected papers (at most three)
                - cited_by: a list of the citation numbers of the selected papers
                - organic_id: a list of the organic results' ids of the selected papers
                - snippets: a list of the text snippets of the selected papers
                - pub_info: a list of the publication information of the selected papers, when included in the result
        """
        # Imported lazily, and outside the ``try``, so a missing ``serpapi``
        # package surfaces as an ImportError instead of an HTTP_ERROR result.
        from serpapi import GoogleSearch

        params = {
            'q': query,
            'engine': 'google_scholar',
            'api_key': self.api_key,
            'cites': cites,
            'as_ylo': as_ylo,
            'as_yhi': as_yhi,
            'scisbd': scisbd,
            'cluster': cluster,
            'hl': hl,
            'lr': lr,
            'start': start,
            'num': num,
            'as_sdt': as_sdt,
            'safe': safe,
            'filter': filter,
            'as_vis': as_vis,
        }
        try:
            response = self._check_response(GoogleSearch(params).get_dict())
            title, snippets, cited_by, organic_id, pub_info = [], [], [], [], []
            # Only the first three organic results are reported.
            for item in response['organic_results'][:3]:
                title.append(item['title'])
                organic_id.append(item['result_id'])
                # The remaining fields are treated as optional: one sparse
                # result must not fail the whole search.
                pub_info.append(item.get('publication_info', {}).get('summary', ''))
                cited_by.append(item.get('inline_links', {}).get('cited_by', {}).get('total', ''))
                snippets.append(item.get('snippet', ''))
            return dict(
                title=title,
                cited_by=cited_by,
                organic_id=organic_id,
                snippets=snippets,
                pub_info=pub_info,
            )
        except Exception as e:
            # Any failure is reported as a structured error instead of raising.
            return ActionReturn(errmsg=str(e), state=ActionStatusCode.HTTP_ERROR)

    @tool_api(explode_return=True)
    def get_author_information(
        self,
        author_id: str,
        hl: Optional[str] = None,
        view_op: Optional[str] = None,
        sort: Optional[str] = None,
        citation_id: Optional[str] = None,
        start: Optional[int] = None,
        num: Optional[int] = None,
        no_cache: Optional[bool] = None,
        async_req: Optional[bool] = None,
        output: Optional[str] = None,
    ) -> dict:
        """Search for an author's information by author's id provided by get_author_id.

        Args:
            author_id (str): Required. The ID of an author.
            hl (Optional[str]): The language to use for the Google Scholar Author search. Default is 'en'.
            view_op (Optional[str]): Used for viewing specific parts of a page.
            sort (Optional[str]): Used for sorting and refining articles.
            citation_id (Optional[str]): Used for retrieving individual article citation.
            start (Optional[int]): Defines the result offset. Default is 0.
            num (Optional[int]): Defines the number of results to return. Default is 20.
            no_cache (Optional[bool]): Forces SerpApi to fetch the results even if a cached version is already present. Default is False.
            async_req (Optional[bool]): Defines the way you want to submit your search to SerpApi. Default is False.
            output (Optional[str]): Defines the final output you want. Default is 'json'.

        Returns:
            :class:`dict`: author information
                * name: author's name
                * affiliations: the affiliations of the author
                * website: the author's homepage url
                * articles: at most 3 articles by the author, each with its title and authors
        """
        from serpapi import GoogleSearch

        params = {
            'engine': 'google_scholar_author',
            'author_id': author_id,
            'api_key': self.api_key,
            'hl': hl,
            'view_op': view_op,
            'sort': sort,
            'citation_id': citation_id,
            'start': start,
            'num': num,
            'no_cache': no_cache,
            # ``async`` is a Python keyword, hence the ``async_req`` parameter.
            'async': async_req,
            'output': output,
        }
        try:
            response = self._check_response(GoogleSearch(params).get_dict())
            author = response['author']
            articles = response.get('articles', [])
            return dict(
                name=author['name'],
                affiliations=author.get('affiliations', ''),
                website=author.get('website', ''),
                articles=[
                    dict(title=article['title'], authors=article.get('authors', ''))
                    for article in articles[:3]
                ],
            )
        except Exception as e:
            return ActionReturn(errmsg=str(e), state=ActionStatusCode.HTTP_ERROR)

    @tool_api(explode_return=True)
    def get_citation_format(
        self,
        q: str,
        no_cache: Optional[bool] = None,
        async_: Optional[bool] = None,
        output: Optional[str] = 'json',
    ) -> dict:
        """Function to get MLA citation format by an identification of organic_result's id provided by search_google_scholar.

        Args:
            q (str): ID of an individual Google Scholar organic search result.
            no_cache (Optional[bool]): If set to True, will force SerpApi to fetch the Google Scholar Cite results even if a cached version is already present. Defaults to None.
            async_ (Optional[bool]): If set to True, will submit search to SerpApi and retrieve results later. Defaults to None.
            output (Optional[str]): Final output format. Set to 'json' to get a structured JSON of the results, or 'html' to get the raw html retrieved. Defaults to 'json'.

        Returns:
            :class:`dict`: citation format
                * citation: the citation text of the article, taken from the first listed citation and returned as a plain string
        """
        from serpapi import GoogleSearch

        params = {
            'q': q,
            'engine': 'google_scholar_cite',
            'api_key': self.api_key,
            'no_cache': no_cache,
            'async': async_,
            'output': output,
        }
        try:
            response = self._check_response(GoogleSearch(params).get_dict())
            citations = response['citations']
            if not citations:
                raise ValueError(f'No citation found for result id {q!r}.')
            # NOTE: the value handed back is the snippet string of the first
            # citation; the ``-> dict`` annotation is kept only so the
            # signature stays unchanged for existing callers.
            return citations[0]['snippet']
        except Exception as e:
            return ActionReturn(errmsg=str(e), state=ActionStatusCode.HTTP_ERROR)

    @tool_api(explode_return=True)
    def get_author_id(
        self,
        mauthors: str,
        hl: Optional[str] = 'en',
        after_author: Optional[str] = None,
        before_author: Optional[str] = None,
        no_cache: Optional[bool] = False,
        _async: Optional[bool] = False,
        output: Optional[str] = 'json',
    ) -> dict:
        """The getAuthorId function is used to get the author's id by his or her name.

        Args:
            mauthors (str): Defines the author you want to search for.
            hl (Optional[str]): Defines the language to use for the Google Scholar Profiles search. It's a two-letter language code. (e.g., 'en' for English, 'es' for Spanish, or 'fr' for French). Defaults to 'en'.
            after_author (Optional[str]): Defines the next page token. It is used for retrieving the next page results. The parameter has the precedence over before_author parameter. Defaults to None.
            before_author (Optional[str]): Defines the previous page token. It is used for retrieving the previous page results. Defaults to None.
            no_cache (Optional[bool]): Will force SerpApi to fetch the Google Scholar Profiles results even if a cached version is already present. Defaults to False.
            _async (Optional[bool]): Defines the way you want to submit your search to SerpApi. Defaults to False.
            output (Optional[str]): Defines the final output you want. It can be set to 'json' (default) to get a structured JSON of the results, or 'html' to get the raw html retrieved. Defaults to 'json'.

        Returns:
            :class:`dict`: author id
                * author_id: the author_id of the first matching profile
        """
        from serpapi import GoogleSearch

        params = {
            'mauthors': mauthors,
            'engine': 'google_scholar_profiles',
            'api_key': self.api_key,
            'hl': hl,
            'after_author': after_author,
            'before_author': before_author,
            'no_cache': no_cache,
            'async': _async,
            'output': output,
        }
        try:
            response = self._check_response(GoogleSearch(params).get_dict())
            profiles = response['profiles']
            if not profiles:
                raise ValueError(f'No author profile found for {mauthors!r}.')
            # Only the first profile is used.
            return dict(author_id=profiles[0]['author_id'])
        except Exception as e:
            return ActionReturn(errmsg=str(e), state=ActionStatusCode.HTTP_ERROR)
class AsyncGoogleScholar(AsyncActionMixin, GoogleScholar):
    """Plugin for google scholar search.

    Each API below is the corresponding :class:`GoogleScholar` method wrapped
    with ``asyncify``; the arguments are forwarded unchanged.

    Args:
        api_key (str): API key sent with every request made by
            :class:`GoogleScholar`. The ``SERPER_API_KEY`` environment
            variable, when set, takes precedence over this argument.
        description (dict): The description of the action. Defaults to ``None``.
        parser (Type[BaseParser]): The parser class to process the
            action's inputs and outputs. Defaults to :class:`JsonParser`.
    """

    @tool_api(explode_return=True)
    @asyncify
    def search_google_scholar(
        self,
        query: str,
        cites: Optional[str] = None,
        as_ylo: Optional[int] = None,
        as_yhi: Optional[int] = None,
        scisbd: Optional[int] = None,
        cluster: Optional[str] = None,
        hl: Optional[str] = None,
        lr: Optional[str] = None,
        start: Optional[int] = None,
        num: Optional[int] = None,
        as_sdt: Optional[str] = None,
        safe: Optional[str] = None,
        filter: Optional[str] = None,
        as_vis: Optional[str] = None,
    ) -> dict:
        """Search for scholarly articles based on a query according to the google scholar.

        Args:
            query (str): The query to search for.
            cites (Optional[str]): The unique ID of an article for triggering "Cited By" searches.
            as_ylo (Optional[int]): The starting year for results (e.g., if as_ylo=2018, results before this year will be omitted).
            as_yhi (Optional[int]): The ending year for results (e.g., if as_yhi=2018, results after this year will be omitted).
            scisbd (Optional[int]): Defines articles added in the last year, sorted by date. It can be set to 1 to include only abstracts, or 2 to include everything.
            cluster (Optional[str]): The unique ID of an article for triggering "All Versions" searches.
            hl (Optional[str]): The language to use for the Google Scholar search.
            lr (Optional[str]): One or multiple languages to limit the search to.
            start (Optional[int]): The result offset for pagination (0 is the first page of results, 10 is the 2nd page, etc.)
            num (Optional[int]): The maximum number of results to return, limited to 20.
            as_sdt (Optional[str]): Can be used either as a search type or a filter.
            safe (Optional[str]): The level of filtering for adult content.
            filter (Optional[str]): Defines if the filters for 'Similar Results' and 'Omitted Results' are on or off.
            as_vis (Optional[str]): Defines whether to include citations or not.

        Returns:
            :class:`dict`: article information
                - title: a list of the titles of the selected papers (at most three)
                - cited_by: a list of the citation numbers of the selected papers
                - organic_id: a list of the organic results' ids of the selected papers
                - snippets: a list of the text snippets of the selected papers
                - pub_info: a list of the publication information of the selected papers, when included in the result
        """
        # Forward by keyword so the long argument list cannot be silently
        # mis-bound if the parent's parameter order ever changes.
        return super().search_google_scholar(
            query=query,
            cites=cites,
            as_ylo=as_ylo,
            as_yhi=as_yhi,
            scisbd=scisbd,
            cluster=cluster,
            hl=hl,
            lr=lr,
            start=start,
            num=num,
            as_sdt=as_sdt,
            safe=safe,
            filter=filter,
            as_vis=as_vis,
        )

    @tool_api(explode_return=True)
    @asyncify
    def get_author_information(
        self,
        author_id: str,
        hl: Optional[str] = None,
        view_op: Optional[str] = None,
        sort: Optional[str] = None,
        citation_id: Optional[str] = None,
        start: Optional[int] = None,
        num: Optional[int] = None,
        no_cache: Optional[bool] = None,
        async_req: Optional[bool] = None,
        output: Optional[str] = None,
    ) -> dict:
        """Search for an author's information by author's id provided by get_author_id.

        Args:
            author_id (str): Required. The ID of an author.
            hl (Optional[str]): The language to use for the Google Scholar Author search. Default is 'en'.
            view_op (Optional[str]): Used for viewing specific parts of a page.
            sort (Optional[str]): Used for sorting and refining articles.
            citation_id (Optional[str]): Used for retrieving individual article citation.
            start (Optional[int]): Defines the result offset. Default is 0.
            num (Optional[int]): Defines the number of results to return. Default is 20.
            no_cache (Optional[bool]): Forces SerpApi to fetch the results even if a cached version is already present. Default is False.
            async_req (Optional[bool]): Defines the way you want to submit your search to SerpApi. Default is False.
            output (Optional[str]): Defines the final output you want. Default is 'json'.

        Returns:
            :class:`dict`: author information
                * name: author's name
                * affiliations: the affiliations of the author
                * website: the author's homepage url
                * articles: at most 3 articles by the author, each with its title and authors
        """
        return super().get_author_information(
            author_id=author_id,
            hl=hl,
            view_op=view_op,
            sort=sort,
            citation_id=citation_id,
            start=start,
            num=num,
            no_cache=no_cache,
            async_req=async_req,
            output=output,
        )

    @tool_api(explode_return=True)
    @asyncify
    def get_citation_format(
        self,
        q: str,
        no_cache: Optional[bool] = None,
        async_: Optional[bool] = None,
        output: Optional[str] = 'json',
    ) -> dict:
        """Function to get MLA citation format by an identification of organic_result's id provided by search_google_scholar.

        Args:
            q (str): ID of an individual Google Scholar organic search result.
            no_cache (Optional[bool]): If set to True, will force SerpApi to fetch the Google Scholar Cite results even if a cached version is already present. Defaults to None.
            async_ (Optional[bool]): If set to True, will submit search to SerpApi and retrieve results later. Defaults to None.
            output (Optional[str]): Final output format. Set to 'json' to get a structured JSON of the results, or 'html' to get the raw html retrieved. Defaults to 'json'.

        Returns:
            :class:`dict`: citation format
                * citation: the citation text of the article, taken from the first listed citation and returned as a plain string
        """
        return super().get_citation_format(
            q=q,
            no_cache=no_cache,
            async_=async_,
            output=output,
        )

    @tool_api(explode_return=True)
    @asyncify
    def get_author_id(
        self,
        mauthors: str,
        hl: Optional[str] = 'en',
        after_author: Optional[str] = None,
        before_author: Optional[str] = None,
        no_cache: Optional[bool] = False,
        _async: Optional[bool] = False,
        output: Optional[str] = 'json',
    ) -> dict:
        """The getAuthorId function is used to get the author's id by his or her name.

        Args:
            mauthors (str): Defines the author you want to search for.
            hl (Optional[str]): Defines the language to use for the Google Scholar Profiles search. It's a two-letter language code. (e.g., 'en' for English, 'es' for Spanish, or 'fr' for French). Defaults to 'en'.
            after_author (Optional[str]): Defines the next page token. It is used for retrieving the next page results. The parameter has the precedence over before_author parameter. Defaults to None.
            before_author (Optional[str]): Defines the previous page token. It is used for retrieving the previous page results. Defaults to None.
            no_cache (Optional[bool]): Will force SerpApi to fetch the Google Scholar Profiles results even if a cached version is already present. Defaults to False.
            _async (Optional[bool]): Defines the way you want to submit your search to SerpApi. Defaults to False.
            output (Optional[str]): Defines the final output you want. It can be set to 'json' (default) to get a structured JSON of the results, or 'html' to get the raw html retrieved. Defaults to 'json'.

        Returns:
            :class:`dict`: author id
                * author_id: the author_id of the first matching profile
        """
        return super().get_author_id(
            mauthors=mauthors,
            hl=hl,
            after_author=after_author,
            before_author=before_author,
            no_cache=no_cache,
            _async=_async,
            output=output,
        )