# NOTE(review): removed non-Python page artifacts captured with this file
# ("Spaces: Running on Zero" — Hugging Face Spaces badge text).
from collections import OrderedDict
from datetime import datetime
from typing import Optional
class CacheHandler:
    """In-memory LRU cache mapping URLs to page data and pre-computed summaries.

    Each entry is a dict with keys ``"title"``, ``"text"``, ``"date"`` and one
    ``"summary_<N>"`` key per summarization level (0, 50 and 100 are the levels
    the demo uses). An ``OrderedDict`` tracks recency: entries are moved to the
    end on access/update, and the oldest (front) entry is evicted once the
    cache exceeds ``max_cache_size``. The demo's default page is pre-seeded in
    ``__init__`` and is deliberately never evicted. Hit/miss counters are kept
    for :meth:`get_cache_stats`.
    """

    # The demo's default page; pre-seeded below and protected from eviction.
    _PROTECTED_URL = "https://ikergarcia1996.github.io/Iker-Garcia-Ferrero/"

    def __init__(self, max_cache_size: int = 1000):
        """Create the cache, pre-seeded with the demo's default page.

        Args:
            max_cache_size: Maximum number of entries kept before the
                least-recently-used one is evicted.
        """
        # OrderedDict maintains insertion/recency order so the oldest item
        # can be removed in O(1) via popitem(last=False).
        self.cache: OrderedDict = OrderedDict()
        self.cache[self._PROTECTED_URL] = {
            "title": "Iker García-Ferrero | Personal Webpage",
            "date": datetime.now(),
            "text": """I am currently a PhD candidate specializing in Natural Language Processing (NLP) at the University of the Basque Country UPV/EHU, IXA Group, and HiTZ Basque Center for Language Technologies, funded by a grant from the Basque Government. My advisors are German Rigau and Rodrigo Agerri. I anticipate concluding my PhD by early 2024.
My previous experiences include an internship as an Applied Scientist at Amazon Barcelona, where I was part of Lluis Marquez's team. I also served as Visiting Associate for 4 months at the School of Engineering and Applied Science, Department of Computer and Information Science, Cognitive Computation Group at the University of Pennsylvania under the supervision of Dan Roth.
My research primarily focuses on Multilingual Natural Language Processing. I aim to develop deep learning models and resources that enable NLP in languages with limited or no available resources. This research branches in two directions. First, data-transfer methods for which I have developed state-of-the-art techniques to automatically generate annotated data for languages that lack these resources. Second, model-transfer methods, a field in which I've made significant contributions to improve the zero-shot cross-lingual performance of NLP models. Recently, my research has branched into training Large Language Models (LLMs) for various tasks and domains. The most notable ones being GoLLIE a 34B parameter LLM which achieves state-of-the-art results for zero-shot Information Extraction, and MedMT5, the first open-source text-to-text multilingual model for the medical domain.
""",
            "summary_0": "Iker García-Ferrero es un candidato a PhD en Natural Language Processing (NLP) "
            "en la Universidad del País Vasco UPV/EHU, IXA Group y HiTZ Centro Vasco de Tecnología de la "
            "Lengua, financiado por una beca del Gobierno Vasco. "
            "En el pasado, ha realizado prácticas en Amazon y ha realizado una estancia "
            "de investigación en la Universidad de Pensilvania (EEUU)."
            "Sus investigaciones se centran en la creación de modelos y recursos para NLP en "
            "lenguas con pocos o ningún recurso disponible, utilizando técnicas de transferencia de "
            "datos y modelos. Recientemente también se ha especializado en el entrenamiento de LLMs",
            "summary_50": "Iker García-Ferrero es un candidato a PhD en NLP en la Universidad del País Vasco, "
            "con experiencia en Amazon, la Universidad de Pensilvania e HiTZ.",
            "summary_100": "Iker García-Ferrero es un candidato a PhD en NLP.",
        }
        self.max_cache_size = max_cache_size
        self.misses = 0  # lookups that found no usable summary
        self.hits = 0  # lookups that returned a cached summary

    def add_to_cache(
        self, url: str, title: str, text: str, summary_type: int, summary: str
    ):
        """Insert or update the cached summary for ``url``.

        On update, only the requested summary level and the timestamp are
        refreshed (title/text are assumed unchanged for the same URL) and the
        entry is marked most-recently-used. On insert, the entry is created
        with the given summary in its level slot and ``None`` elsewhere; if
        the cache then exceeds ``max_cache_size`` the least-recently-used
        entry is evicted, never the protected demo entry.

        Args:
            url: Cache key (the page's URL).
            title: Page title.
            text: Full page text.
            summary_type: Summarization level (0, 50 or 100 in the demo).
            summary: The summary text to store.
        """
        if url in self.cache:
            # Refresh recency and store the new summary level.
            self.cache.move_to_end(url)
            self.cache[url][f"summary_{summary_type}"] = summary
            self.cache[url]["date"] = datetime.now()
        else:
            self.cache[url] = {
                "title": title,
                "text": text,
                "date": datetime.now(),
                "summary_0": summary if summary_type == 0 else None,
                "summary_50": summary if summary_type == 50 else None,
                "summary_100": summary if summary_type == 100 else None,
            }
            # Evict the oldest entry once the cache outgrows its budget.
            if len(self.cache) > self.max_cache_size:
                # Keep the demo's default entry alive: bump it to the
                # most-recent end before popping the true oldest item.
                # (Guarded: the original crashed with KeyError if the
                # seeded entry had ever been removed.)
                if self._PROTECTED_URL in self.cache:
                    self.cache.move_to_end(self._PROTECTED_URL)
                self.cache.popitem(last=False)  # pop the oldest item

    def get_from_cache(
        self, url: str, summary_type: int, second_try: bool = False
    ) -> Optional[tuple]:
        """Look up the cached summary of ``url`` at the given level.

        Args:
            url: Cache key (the page's URL).
            summary_type: Summarization level requested.
            second_try: True when this is a retry after resolving a
                shortened URL; a hit then cancels the miss recorded by the
                first attempt, and a second miss is not double-counted.

        Returns:
            ``(title, text, summary)`` on a hit, ``(None, None, None)``
            otherwise.
        """
        entry = self.cache.get(url)
        # .get() on the summary key avoids a KeyError when the entry exists
        # but was never populated for this level (the original indexed the
        # dict directly and could crash for levels outside 0/50/100).
        if entry is not None and entry.get(f"summary_{summary_type}") is not None:
            # Mark as recently used so it survives eviction longer.
            self.cache.move_to_end(url)
            self.hits += 1
            if second_try:
                # The first try missed (probably a shortened URL); undo that
                # recorded miss now that the resolved URL hit.
                self.misses -= 1
            return (
                entry["title"],
                entry["text"],
                entry[f"summary_{summary_type}"],
            )
        if not second_try:
            self.misses += 1
        return None, None, None

    def get_cache_stats(self):
        """Return ``(hits, misses, current number of cached entries)``."""
        return self.hits, self.misses, len(self.cache)