# flake8: noqa: E501
import os
from typing import Optional, Type

from asyncer import asyncify

from lagent.actions.base_action import AsyncActionMixin, BaseAction, tool_api
from lagent.schema import ActionReturn, ActionStatusCode
from .parser import BaseParser, JsonParser
class GoogleScholar(BaseAction):
    """Plugin for google scholar search.

    Args:
        api_key (str): API key sent with every request made through
            ``serpapi.GoogleSearch``. When the ``SERPER_API_KEY`` environment
            variable is set, it takes precedence over this argument.
        description (dict): The description of the action. Defaults to ``None``.
        parser (Type[BaseParser]): The parser class to process the
            action's inputs and outputs. Defaults to :class:`JsonParser`.

    Raises:
        ValueError: If no API key is passed and ``SERPER_API_KEY`` is not set.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        description: Optional[dict] = None,
        parser: Type[BaseParser] = JsonParser,
    ):
        super().__init__(description, parser)
        # The environment variable wins; the argument is only the fallback.
        api_key = os.environ.get('SERPER_API_KEY', api_key)
        if api_key is None:
            raise ValueError(
                'Please set Serper API key either in the environment '
                'as SERPER_API_KEY or pass it as `api_key` parameter.'
            )
        self.api_key = api_key

    def _query_serpapi(self, client, params: dict, required: str) -> dict:
        """Run one search and return the response dictionary.

        Args:
            client: The ``serpapi.GoogleSearch`` class, imported by the caller.
            params (dict): Engine-specific request parameters (without the API key).
            required (str): Response key the caller is going to read.

        Returns:
            dict: The full response, guaranteed to contain ``required``.

        Raises:
            RuntimeError: If ``required`` is missing from the response. The
                message is the response's ``error`` entry when there is one.
        """
        # Options the caller left unset are omitted instead of being sent as None.
        request = {name: value for name, value in params.items() if value is not None}
        request['api_key'] = self.api_key
        response = client(request).get_dict()
        if required not in response:
            # Prefer the message supplied in the response over a bare KeyError.
            raise RuntimeError(response.get('error') or f'no {required!r} in the search response')
        return response

    @tool_api(explode_return=True)
    def search_google_scholar(
        self,
        query: str,
        cites: Optional[str] = None,
        as_ylo: Optional[int] = None,
        as_yhi: Optional[int] = None,
        scisbd: Optional[int] = None,
        cluster: Optional[str] = None,
        hl: Optional[str] = None,
        lr: Optional[str] = None,
        start: Optional[int] = None,
        num: Optional[int] = None,
        as_sdt: Optional[str] = None,
        safe: Optional[str] = None,
        filter: Optional[str] = None,
        as_vis: Optional[str] = None,
    ) -> dict:
        """Search for scholarly articles based on a query according to the google scholar.

        Args:
            query (str): The query to search for.
            cites (Optional[str]): The unique ID of an article for triggering "Cited By" searches.
            as_ylo (Optional[int]): The starting year for results (e.g., if as_ylo=2018, results before this year will be omitted).
            as_yhi (Optional[int]): The ending year for results (e.g., if as_yhi=2018, results after this year will be omitted).
            scisbd (Optional[int]): Defines articles added in the last year, sorted by date. It can be set to 1 to include only abstracts, or 2 to include everything.
            cluster (Optional[str]): The unique ID of an article for triggering "All Versions" searches.
            hl (Optional[str]): The language to use for the Google Scholar search.
            lr (Optional[str]): One or multiple languages to limit the search to.
            start (Optional[int]): The result offset for pagination (0 is the first page of results, 10 is the 2nd page, etc.)
            num (Optional[int]): The maximum number of results to return, limited to 20.
            as_sdt (Optional[str]): Can be used either as a search type or a filter.
            safe (Optional[str]): The level of filtering for adult content.
            filter (Optional[str]): Defines if the filters for 'Similar Results' and 'Omitted Results' are on or off.
            as_vis (Optional[str]): Defines whether to include citations or not.

        Returns:
            :class:`dict`: article information
                - title: a list of the titles of the first three papers found
                - cited_by: a list of the citation numbers of the first three papers found
                - organic_id: a list of the organic results' ids of the first three papers found
                - snippets: a list of the text snippets of the first three papers found
                - pub_info: a list of the publication information summaries of the first three papers found
        """
        # Imported lazily so the serpapi package is only needed when a search runs.
        from serpapi import GoogleSearch

        params = {
            'q': query,
            'engine': 'google_scholar',
            'cites': cites,
            'as_ylo': as_ylo,
            'as_yhi': as_yhi,
            'scisbd': scisbd,
            'cluster': cluster,
            'hl': hl,
            'lr': lr,
            'start': start,
            'num': num,
            'as_sdt': as_sdt,
            'safe': safe,
            'filter': filter,
            'as_vis': as_vis,
        }
        try:
            response = self._query_serpapi(GoogleSearch, params, 'organic_results')
            papers = response['organic_results'][:3]
            # Title and result id are required; the remaining fields fall back
            # to '' so one sparse result does not discard the whole search.
            return dict(
                title=[paper['title'] for paper in papers],
                cited_by=[
                    paper.get('inline_links', {}).get('cited_by', {}).get('total', '') for paper in papers
                ],
                organic_id=[paper['result_id'] for paper in papers],
                snippets=[paper.get('snippet', '') for paper in papers],
                pub_info=[paper.get('publication_info', {}).get('summary', '') for paper in papers],
            )
        except Exception as e:
            # Any failure is reported to the caller as an ActionReturn, not raised.
            return ActionReturn(errmsg=str(e), state=ActionStatusCode.HTTP_ERROR)

    @tool_api(explode_return=True)
    def get_author_information(
        self,
        author_id: str,
        hl: Optional[str] = None,
        view_op: Optional[str] = None,
        sort: Optional[str] = None,
        citation_id: Optional[str] = None,
        start: Optional[int] = None,
        num: Optional[int] = None,
        no_cache: Optional[bool] = None,
        async_req: Optional[bool] = None,
        output: Optional[str] = None,
    ) -> dict:
        """Search for an author's information by author's id provided by get_author_id.

        Args:
            author_id (str): Required. The ID of an author.
            hl (Optional[str]): The language to use for the Google Scholar Author search. Default is 'en'.
            view_op (Optional[str]): Used for viewing specific parts of a page.
            sort (Optional[str]): Used for sorting and refining articles.
            citation_id (Optional[str]): Used for retrieving individual article citation.
            start (Optional[int]): Defines the result offset. Default is 0.
            num (Optional[int]): Defines the number of results to return. Default is 20.
            no_cache (Optional[bool]): Forces SerpApi to fetch the results even if a cached version is already present. Default is False.
            async_req (Optional[bool]): Defines the way you want to submit your search to SerpApi. Default is False.
            output (Optional[str]): Defines the final output you want. Default is 'json'.

        Returns:
            :class:`dict`: author information
                * name: author's name
                * affiliations: the affiliations of the author
                * website: the author's homepage url
                * articles: at most 3 articles by the author
        """
        # Imported lazily so the serpapi package is only needed when a search runs.
        from serpapi import GoogleSearch

        params = {
            'engine': 'google_scholar_author',
            'author_id': author_id,
            'hl': hl,
            'view_op': view_op,
            'sort': sort,
            'citation_id': citation_id,
            'start': start,
            'num': num,
            'no_cache': no_cache,
            'async': async_req,  # `async` is a reserved word, hence the differently named argument
            'output': output,
        }
        try:
            response = self._query_serpapi(GoogleSearch, params, 'author')
            author = response['author']
            return dict(
                name=author['name'],
                affiliations=author.get('affiliations', ''),
                website=author.get('website', ''),
                articles=[
                    dict(title=article.get('title', ''), authors=article.get('authors', ''))
                    for article in response.get('articles', [])[:3]
                ],
            )
        except Exception as e:
            # Any failure is reported to the caller as an ActionReturn, not raised.
            return ActionReturn(errmsg=str(e), state=ActionStatusCode.HTTP_ERROR)

    @tool_api
    def get_citation_format(
        self,
        q: str,
        no_cache: Optional[bool] = None,
        async_: Optional[bool] = None,
        output: Optional[str] = 'json',
    ) -> str:
        """Function to get MLA citation format by an identification of organic_result's id provided by search_google_scholar.

        Args:
            q (str): ID of an individual Google Scholar organic search result.
            no_cache (Optional[bool]): If set to True, will force SerpApi to fetch the Google Scholar Cite results even if a cached version is already present. Defaults to None.
            async_ (Optional[bool]): If set to True, will submit search to SerpApi and retrieve results later. Defaults to None.
            output (Optional[str]): Final output format. Set to 'json' to get a structured JSON of the results, or 'html' to get the raw html retrieved. Defaults to 'json'.

        Returns:
            :class:`str`: the citation text of the first citation entry returned for the article
        """
        # Imported lazily so the serpapi package is only needed when a search runs.
        from serpapi import GoogleSearch

        params = {
            'q': q,
            'engine': 'google_scholar_cite',
            'no_cache': no_cache,
            'async': async_,
            'output': output,
        }
        try:
            response = self._query_serpapi(GoogleSearch, params, 'citations')
            citations = response['citations']
            if not citations:
                # An empty list would otherwise surface as "list index out of range".
                return ActionReturn(
                    errmsg=f'no citation returned for result id {q!r}', state=ActionStatusCode.HTTP_ERROR
                )
            # NOTE(review): the first entry is presumably the MLA format — confirm against the API response.
            return citations[0]['snippet']
        except Exception as e:
            # Any failure is reported to the caller as an ActionReturn, not raised.
            return ActionReturn(errmsg=str(e), state=ActionStatusCode.HTTP_ERROR)

    @tool_api(explode_return=True)
    def get_author_id(
        self,
        mauthors: str,
        hl: Optional[str] = 'en',
        after_author: Optional[str] = None,
        before_author: Optional[str] = None,
        no_cache: Optional[bool] = False,
        _async: Optional[bool] = False,
        output: Optional[str] = 'json',
    ) -> dict:
        """Get the author's id by the author's name.

        Args:
            mauthors (str): Defines the author you want to search for.
            hl (Optional[str]): Defines the language to use for the Google Scholar Profiles search. It's a two-letter language code. (e.g., 'en' for English, 'es' for Spanish, or 'fr' for French). Defaults to 'en'.
            after_author (Optional[str]): Defines the next page token. It is used for retrieving the next page results. The parameter has the precedence over before_author parameter. Defaults to None.
            before_author (Optional[str]): Defines the previous page token. It is used for retrieving the previous page results. Defaults to None.
            no_cache (Optional[bool]): Will force SerpApi to fetch the Google Scholar Profiles results even if a cached version is already present. Defaults to False.
            _async (Optional[bool]): Defines the way you want to submit your search to SerpApi. Defaults to False.
            output (Optional[str]): Defines the final output you want. It can be set to 'json' (default) to get a structured JSON of the results, or 'html' to get the raw html retrieved. Defaults to 'json'.

        Returns:
            :class:`dict`: author id
                * author_id: the author_id of the first matching profile
        """
        # Imported lazily so the serpapi package is only needed when a search runs.
        from serpapi import GoogleSearch

        params = {
            'mauthors': mauthors,
            'engine': 'google_scholar_profiles',
            'hl': hl,
            'after_author': after_author,
            'before_author': before_author,
            'no_cache': no_cache,
            'async': _async,
            'output': output,
        }
        try:
            response = self._query_serpapi(GoogleSearch, params, 'profiles')
            profiles = response['profiles']
            if not profiles:
                # An empty list would otherwise surface as "list index out of range".
                return ActionReturn(
                    errmsg=f'no author profile found for {mauthors!r}', state=ActionStatusCode.HTTP_ERROR
                )
            return dict(author_id=profiles[0]['author_id'])
        except Exception as e:
            # Any failure is reported to the caller as an ActionReturn, not raised.
            return ActionReturn(errmsg=str(e), state=ActionStatusCode.HTTP_ERROR)
class AsyncGoogleScholar(AsyncActionMixin, GoogleScholar):
    """Asynchronous plugin for google scholar search.

    Every API method delegates to the synchronous :class:`GoogleScholar`
    implementation; ``asyncify`` turns that blocking call into an awaitable.

    Args:
        api_key (str): API key sent with every request made through
            ``serpapi.GoogleSearch``. When the ``SERPER_API_KEY`` environment
            variable is set, it takes precedence over this argument.
        description (dict): The description of the action. Defaults to ``None``.
        parser (Type[BaseParser]): The parser class to process the
            action's inputs and outputs. Defaults to :class:`JsonParser`.
    """

    @tool_api(explode_return=True)
    @asyncify
    def search_google_scholar(
        self,
        query: str,
        cites: Optional[str] = None,
        as_ylo: Optional[int] = None,
        as_yhi: Optional[int] = None,
        scisbd: Optional[int] = None,
        cluster: Optional[str] = None,
        hl: Optional[str] = None,
        lr: Optional[str] = None,
        start: Optional[int] = None,
        num: Optional[int] = None,
        as_sdt: Optional[str] = None,
        safe: Optional[str] = None,
        filter: Optional[str] = None,
        as_vis: Optional[str] = None,
    ) -> dict:
        """Search for scholarly articles based on a query according to the google scholar.

        Args:
            query (str): The query to search for.
            cites (Optional[str]): The unique ID of an article for triggering "Cited By" searches.
            as_ylo (Optional[int]): The starting year for results (e.g., if as_ylo=2018, results before this year will be omitted).
            as_yhi (Optional[int]): The ending year for results (e.g., if as_yhi=2018, results after this year will be omitted).
            scisbd (Optional[int]): Defines articles added in the last year, sorted by date. It can be set to 1 to include only abstracts, or 2 to include everything.
            cluster (Optional[str]): The unique ID of an article for triggering "All Versions" searches.
            hl (Optional[str]): The language to use for the Google Scholar search.
            lr (Optional[str]): One or multiple languages to limit the search to.
            start (Optional[int]): The result offset for pagination (0 is the first page of results, 10 is the 2nd page, etc.)
            num (Optional[int]): The maximum number of results to return, limited to 20.
            as_sdt (Optional[str]): Can be used either as a search type or a filter.
            safe (Optional[str]): The level of filtering for adult content.
            filter (Optional[str]): Defines if the filters for 'Similar Results' and 'Omitted Results' are on or off.
            as_vis (Optional[str]): Defines whether to include citations or not.

        Returns:
            :class:`dict`: article information
                - title: a list of the titles of the first three papers found
                - cited_by: a list of the citation numbers of the first three papers found
                - organic_id: a list of the organic results' ids of the first three papers found
                - snippets: a list of the text snippets of the first three papers found
        """
        # Keyword pass-through keeps the long argument list from silently
        # shifting if the parent signature is ever reordered.
        return super().search_google_scholar(
            query=query,
            cites=cites,
            as_ylo=as_ylo,
            as_yhi=as_yhi,
            scisbd=scisbd,
            cluster=cluster,
            hl=hl,
            lr=lr,
            start=start,
            num=num,
            as_sdt=as_sdt,
            safe=safe,
            filter=filter,
            as_vis=as_vis,
        )

    @tool_api(explode_return=True)
    @asyncify
    def get_author_information(
        self,
        author_id: str,
        hl: Optional[str] = None,
        view_op: Optional[str] = None,
        sort: Optional[str] = None,
        citation_id: Optional[str] = None,
        start: Optional[int] = None,
        num: Optional[int] = None,
        no_cache: Optional[bool] = None,
        async_req: Optional[bool] = None,
        output: Optional[str] = None,
    ) -> dict:
        """Search for an author's information by author's id provided by get_author_id.

        Args:
            author_id (str): Required. The ID of an author.
            hl (Optional[str]): The language to use for the Google Scholar Author search. Default is 'en'.
            view_op (Optional[str]): Used for viewing specific parts of a page.
            sort (Optional[str]): Used for sorting and refining articles.
            citation_id (Optional[str]): Used for retrieving individual article citation.
            start (Optional[int]): Defines the result offset. Default is 0.
            num (Optional[int]): Defines the number of results to return. Default is 20.
            no_cache (Optional[bool]): Forces SerpApi to fetch the results even if a cached version is already present. Default is False.
            async_req (Optional[bool]): Defines the way you want to submit your search to SerpApi. Default is False.
            output (Optional[str]): Defines the final output you want. Default is 'json'.

        Returns:
            :class:`dict`: author information
                * name: author's name
                * affiliations: the affiliations of the author
                * website: the author's homepage url
                * articles: at most 3 articles by the author
        """
        return super().get_author_information(
            author_id=author_id,
            hl=hl,
            view_op=view_op,
            sort=sort,
            citation_id=citation_id,
            start=start,
            num=num,
            no_cache=no_cache,
            async_req=async_req,
            output=output,
        )

    @tool_api
    @asyncify
    def get_citation_format(
        self,
        q: str,
        no_cache: Optional[bool] = None,
        async_: Optional[bool] = None,
        output: Optional[str] = 'json',
    ) -> str:
        """Function to get MLA citation format by an identification of organic_result's id provided by search_google_scholar.

        Args:
            q (str): ID of an individual Google Scholar organic search result.
            no_cache (Optional[bool]): If set to True, will force SerpApi to fetch the Google Scholar Cite results even if a cached version is already present. Defaults to None.
            async_ (Optional[bool]): If set to True, will submit search to SerpApi and retrieve results later. Defaults to None.
            output (Optional[str]): Final output format. Set to 'json' to get a structured JSON of the results, or 'html' to get the raw html retrieved. Defaults to 'json'.

        Returns:
            :class:`str`: the citation text of the first citation entry returned for the article
        """
        return super().get_citation_format(q=q, no_cache=no_cache, async_=async_, output=output)

    @tool_api(explode_return=True)
    @asyncify
    def get_author_id(
        self,
        mauthors: str,
        hl: Optional[str] = 'en',
        after_author: Optional[str] = None,
        before_author: Optional[str] = None,
        no_cache: Optional[bool] = False,
        _async: Optional[bool] = False,
        output: Optional[str] = 'json',
    ) -> dict:
        """Get the author's id by the author's name.

        Args:
            mauthors (str): Defines the author you want to search for.
            hl (Optional[str]): Defines the language to use for the Google Scholar Profiles search. It's a two-letter language code. (e.g., 'en' for English, 'es' for Spanish, or 'fr' for French). Defaults to 'en'.
            after_author (Optional[str]): Defines the next page token. It is used for retrieving the next page results. The parameter has the precedence over before_author parameter. Defaults to None.
            before_author (Optional[str]): Defines the previous page token. It is used for retrieving the previous page results. Defaults to None.
            no_cache (Optional[bool]): Will force SerpApi to fetch the Google Scholar Profiles results even if a cached version is already present. Defaults to False.
            _async (Optional[bool]): Defines the way you want to submit your search to SerpApi. Defaults to False.
            output (Optional[str]): Defines the final output you want. It can be set to 'json' (default) to get a structured JSON of the results, or 'html' to get the raw html retrieved. Defaults to 'json'.

        Returns:
            :class:`dict`: author id
                * author_id: the author_id of the first matching profile
        """
        return super().get_author_id(
            mauthors=mauthors,
            hl=hl,
            after_author=after_author,
            before_author=before_author,
            no_cache=no_cache,
            _async=_async,
            output=output,
        )