Spaces:
Runtime error
Runtime error
import json | |
from typing import Any, Dict, List, Optional, Union | |
import gradio as gr | |
import httpx | |
from cachetools import TTLCache, cached | |
from gradio_client import Client | |
from toolz import groupby | |
# Cache lifetime in seconds.
# NOTE(review): nothing in the visible code reads this constant — confirm
# whether a cache decorator was removed or lives elsewhere.
CACHE_TIME = 1 * 60 * 60  # 1 hour

# Shared gradio_client handle for the Space that extracts paper ids from a
# collection; created once at import time and reused by every request.
client = Client("https://librarian-bots-collection-papers-extractor.hf.space/")
def get_arxiv_ids_from_slug(
    slug: str,
) -> Dict[str, Union[None, Dict[str, Dict[str, Union[List[str], List[str]]]]]]:
    """Fetch the extracted paper data for a collection.

    Sends ``slug`` to the ``/predict`` endpoint of the module-level ``client``
    and treats the returned value as the path of a JSON file, which is parsed
    and returned.

    Args:
        slug: Collection slug passed unchanged to the remote endpoint.

    Returns:
        The parsed JSON document.
    """
    result = client.predict(slug, api_name="/predict")
    # Decode explicitly as UTF-8 (the JSON interchange encoding) rather than
    # relying on the platform's locale default.
    with open(result, encoding="utf-8") as f:
        return json.load(f)
def format_arxiv_id_for_semantic_scholar(arxiv_id: str) -> str:
    """Return ``arxiv_id`` prefixed with ``ArXiv:``."""
    return "ArXiv:{}".format(arxiv_id)
def format_ids(data, exclude_keys: Optional[list[str]] = None) -> list[str]:
    """Collect every arXiv id in ``data`` and format it for Semantic Scholar.

    ``data`` is read as a mapping whose values are either ``None`` or mappings
    whose own values each expose an ``"arxiv_ids"`` entry.

    Args:
        data: Mapping of top-level keys to per-repo paper information.
        exclude_keys: Top-level keys of ``data`` to drop before collecting.

    Returns:
        The collected ids, each passed through
        ``format_arxiv_id_for_semantic_scholar``; an empty list when nothing
        is left after filtering.
    """
    if exclude_keys is not None:
        data = {
            key: repos for key, repos in data.items() if key not in exclude_keys
        }
    # Filtering may have removed everything.
    if not data:
        return []
    collected = [
        arxiv_id
        for repo in data.values()
        if repo is not None
        for entry in repo.values()
        for arxiv_id in entry["arxiv_ids"]
    ]
    # format for semantic scholar
    return [format_arxiv_id_for_semantic_scholar(arxiv_id) for arxiv_id in collected]
def get_recommendations_from_semantic_scholar(paper_ids: tuple[str]):
    """POST ``paper_ids`` to the Semantic Scholar recommendations endpoint.

    The ids are sent as ``positivePaperIds``; the request asks for the
    ``externalIds``, ``title`` and ``year`` fields with a limit of 10 and a
    30 second timeout. The id list and the raw response text are printed.

    Args:
        paper_ids: Paper ids to send as positive examples.

    Returns:
        The decoded JSON body of the response.
    """
    # The JSON payload needs a list, not a tuple.
    paper_ids = list(paper_ids)
    print(paper_ids)
    endpoint = "https://api.semanticscholar.org/recommendations/v1/papers/"
    payload = {"positivePaperIds": paper_ids}
    query = {"fields": "externalIds,title,year", "limit": 10}
    response = httpx.post(endpoint, json=payload, params=query, timeout=30)
    print(response.text)
    return response.json()
def is_arxiv_paper(recommendation: Dict[str, Any]) -> bool:
    """Return whether ``recommendation`` carries a non-null ``ArXiv`` external id.

    A missing or null ``externalIds`` entry is treated as "no arXiv id"
    instead of raising, so one incomplete record cannot abort the grouping of
    a whole batch.

    Args:
        recommendation: A single recommended-paper record.

    Returns:
        ``True`` when ``recommendation["externalIds"]["ArXiv"]`` exists and is
        not ``None``; ``False`` otherwise.
    """
    external_ids = recommendation.get("externalIds") or {}
    return external_ids.get("ArXiv") is not None
def group_by_is_arxiv_paper(
    recommendations: List[Dict[str, Any]]
) -> Dict[bool, List[Dict[str, Any]]]:
    """Group ``recommendations`` by the result of ``is_arxiv_paper``.

    Args:
        recommendations: Recommended-paper records.

    Returns:
        The mapping produced by ``toolz.groupby`` keyed on ``is_arxiv_paper``.
    """
    grouped = groupby(is_arxiv_paper, recommendations)
    return grouped
def format_recommendation_into_markdown(
    grouped_recommendations: Dict[bool, List[Dict[str, Any]]]
):
    """Render grouped recommendations as a Markdown comment.

    Papers under the ``True`` key are listed first as links to their Hugging
    Face Papers page (built from their ``ArXiv`` external id); papers under
    the ``False`` key follow as plain bullet points. A section is omitted
    when its group is missing or empty.

    Args:
        grouped_recommendations: Records keyed by whether they have an arXiv id.

    Returns:
        The Markdown text, always starting with the introductory sentence.
    """
    sections = ["The following papers were recommended by the Semantic Scholar API \n\n"]

    on_hub = grouped_recommendations.get(True)
    if on_hub:
        sections.append("## Papers available on Hugging Face Papers:\n\n")
        for paper in on_hub:
            hub_paper_url = f"https://huggingface.co/papers/{paper['externalIds']['ArXiv']}"
            sections.append(f"* [{paper['title']}]({hub_paper_url}) ({paper['year']})\n")

    elsewhere = grouped_recommendations.get(False)
    if elsewhere:
        sections.append("\n\n## Other papers:\n\n")
        sections.extend(f"* {paper['title']} ({paper['year']})\n" for paper in elsewhere)

    return "".join(sections)
def map_repo_name_to_api_key(repo_name: str) -> str:
    """Translate a repo-type name into the matching extractor data key.

    Args:
        repo_name: One of ``"datasets"``, ``"models"`` or ``"papers"``.

    Returns:
        The corresponding key string.

    Raises:
        KeyError: If ``repo_name`` is not one of the known names.
    """
    api_keys = {
        "datasets": "dataset papers",
        "models": "model papers",
        "papers": "papers",
    }
    return api_keys[repo_name]
def get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[list[str]] = None
):
    """Entry point for the UI: build a reading list for a collection.

    Converts ``excluded_repo_types`` to a tuple and delegates to
    ``_get_recommendations_from_slug``.

    Args:
        slug: Collection slug.
        excluded_repo_types: Repo type names to ignore; ``None`` (the default)
            is treated the same as an empty selection.

    Returns:
        Whatever ``_get_recommendations_from_slug`` returns.
    """
    # ``tuple(None)`` raises TypeError, so normalise a missing selection to an
    # empty tuple before converting.
    # NOTE(review): the tuple conversion is presumably there to keep the
    # argument hashable for caching — no cache decorator is visible; confirm.
    excluded_repo_types = tuple(excluded_repo_types or ())
    return _get_recommendations_from_slug(slug, excluded_repo_types=excluded_repo_types)
def _get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[tuple[str]] = None
):
    """Fetch a collection's papers and turn recommendations into Markdown.

    Steps: load the collection data for ``slug``, drop the excluded repo
    types, request recommendations for the remaining arXiv ids and render
    them as Markdown.

    Args:
        slug: Collection slug.
        excluded_repo_types: Repo type names (``"datasets"``, ``"models"``,
            ``"papers"``) to ignore; ``None`` or empty means no exclusions.

    Returns:
        The Markdown reading list, or an explanatory message when no paper
        ids remain after applying the exclusions.

    Raises:
        gr.Error: If the API response has no ``recommendedPapers`` entry.
        KeyError: If an excluded repo type name is not recognised.
    """
    data = get_arxiv_ids_from_slug(slug)
    # Always work with a list so the "no papers" message below can join it
    # even when the caller passed no exclusions (previously a TypeError).
    excluded_keys = [
        map_repo_name_to_api_key(name) for name in excluded_repo_types or ()
    ]
    if excluded_keys:
        print(f"excluded_repo_types_remapped={excluded_keys}")
    ids = format_ids(data, exclude_keys=excluded_keys)
    if not ids:
        return (
            "Based on your collection and exclusions"
            f" ({','.join(excluded_keys)}), there are no papers to recommend. Try"
            " removing some excluded repo types or adding more items to your"
            " collection."
        )
    response = get_recommendations_from_semantic_scholar(tuple(ids))
    recommendations = response.get("recommendedPapers")
    if recommendations is None:
        raise gr.Error("Something went wrong with the Semantic Scholar API")
    grouped = group_by_is_arxiv_paper(recommendations)
    return format_recommendation_into_markdown(grouped)
# Heading shown at the top of the Gradio interface.
# NOTE(review): the characters around the text may be mis-encoded emoji —
# confirm against the intended title.
title = """π Collections Reading List Generator π"""
# Introductory HTML/Markdown rendered below the title.
description = """<img src="https://huggingface.co/datasets/librarian-bots/images/raw/main/Mascot%20Bookie.svg"
alt="Mascot Bookie" width="200" style="float:left; margin-right:20px; margin-bottom:20px;">
\n\n
Hugging Face Collections allow you to curate models, datasets, spaces,
and papers from the Hugging Face Hub.
This Space will generate a reading list based on the items in your collection.
This can be a great way to find related papers to the models and datasets in your collection and dive more deeply into a topic!
The Space works by:
- finding any papers in your collection
- finding papers related to the models and datasets in your collection
- requesting recommendations from the [Semantic Scholar API](https://api.semanticscholar.org/api-docs/recommendations#tag/Paper-Recommendations/operation/post_papers) for these papers.
You can optionally exclude certain repo types from consideration when generating the reading list.
"""
# Sample inputs offered in the UI: a collection slug plus an empty exclusion
# list for each.
example_slugs = [
    ["merve/video-classification-models-6509edd0a6f657faa425e8c3", []],
    ["osanseviero/model-merging-65097893623330a3a51ead66", []],
    ["hf4h/clinical-language-models-64f9c1cd0cedc04f3caca264", []],
]

# Single-line text field for the collection slug.
slug_input = gr.Textbox(
    lines=1,
    label="Collection Slug",
    placeholder="merve/video-classification-models-6509edd0a6f657faa425e8c3",
)

# Multi-select of repo types whose papers should be ignored.
excluded_repo_types_input = gr.Dropdown(
    label="Repos to exclude from contributing to recommendations",
    choices=["datasets", "models", "papers"],
    multiselect=True,
)

# Build the interface and start serving it as soon as the module runs.
demo = gr.Interface(
    fn=get_recommendations_from_slug,
    inputs=[slug_input, excluded_repo_types_input],
    outputs="markdown",
    description=description,
    title=title,
    allow_flagging="never",
    examples=example_slugs,
)
demo.launch()