# Spaces: Sleeping (Hugging Face Spaces page-status banner captured with this file; not part of the program)
import os | |
from collections import OrderedDict | |
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" | |
import gradio as gr | |
from shitsu import ShitsuScorer | |
from huggingface_hub import hf_hub_download | |
class OptimizedShitsuScorer:
    """Bounded, least-recently-used cache of per-language ``ShitsuScorer`` objects.

    At most ``max_models`` scorers are held at once. Requesting a language
    that is not cached evicts the least recently used scorer (when the cache
    is full) and then constructs a new ``ShitsuScorer`` for that language.
    """

    def __init__(self, max_models=2):
        """Create an empty cache.

        Args:
            max_models: Maximum number of scorers kept in the cache.

        Raises:
            ValueError: If ``max_models`` is smaller than 1.
        """
        # With a capacity below 1 the first load would call popitem() on an
        # empty OrderedDict and fail with an opaque KeyError; reject it here.
        if max_models < 1:
            raise ValueError(f"max_models must be at least 1, got {max_models!r}")
        # Insertion order doubles as recency order: oldest entry first.
        self.scorers = OrderedDict()
        self.max_models = max_models
        # Language passed to the most recent get_scorer() call.
        self.current_language = None

    def get_scorer(self, language):
        """Return the scorer for ``language``, loading it if necessary.

        Args:
            language: Language code handed to ``ShitsuScorer``.

        Returns:
            The cached or newly constructed scorer for ``language``.
        """
        scorer = self.scorers.get(language)
        if scorer is not None:
            # Cache hit: mark this language as the most recently used.
            self.scorers.move_to_end(language)
        else:
            gr.Warning("A new language is being loaded in memory, this could take a while...")
            # Evict before loading so that no more than max_models scorers
            # are ever referenced by the cache at the same time.
            if len(self.scorers) >= self.max_models:
                self.scorers.popitem(last=False)
            scorer = ShitsuScorer(language)
            self.scorers[language] = scorer
        self.current_language = language
        return scorer

    def score(self, text, language):
        """Score a single ``text`` with the scorer for ``language``.

        The underlying scorer is called with a one-element list and the
        first element of its result is returned.
        """
        return self.get_scorer(language).score([text])[0]

    def get_loaded_languages(self):
        """Return the cached language codes, least recently used first."""
        return list(self.scorers)
# Module-wide scorer cache shared by every request; holds at most two languages.
optimized_scorer = OptimizedShitsuScorer(max_models=2)

# Load English at start-up so the default language is already cached
# when the first request arrives.
optimized_scorer.get_scorer("en")
# Sample texts offered as one-click examples below the input box.
example_inputs = [
    "The Beatles were a popular band in the 1960s. They released many hit songs.",
    "Chocolate is a type of sweet food that people often eat for dessert.",
    (
        "I'm thinking of going to the beach this weekend. "
        "The weather is supposed to be great!"
    ),
    "Can you believe it's already September? This year is flying by!",
    (
        "Quantum mechanics is a fundamental theory in physics that provides a "
        "description of the physical properties of nature at the scale of atoms "
        "and subatomic particles."
    ),
]
def get_score(user_text, language):
    """Score ``user_text`` and build the two values displayed by the UI.

    Args:
        user_text: Text to score.
        language: Language code selecting which scorer to use.

    Returns:
        A ``(score_html, highlighted)`` tuple. ``score_html`` is an HTML
        snippet with the score formatted to four significant digits.
        ``highlighted`` is a list of ``(text, label)`` pairs naming the
        languages currently held by ``optimized_scorer``, with the language
        code as the label.
    """
    score = optimized_scorer.score(user_text, language)
    score_html = f'<div class="nice-box"> Score: {score:.4g}</div>'

    highlighted = [('Currently loaded languages: \n', None)]
    for code in optimized_scorer.get_loaded_languages():
        # The language dropdown accepts custom values, so a loaded code may
        # be absent from language_map; show the raw code in that case instead
        # of raising KeyError on every request until the model is evicted.
        highlighted.append((language_map.get(code, code), code))
        highlighted.append((' ', None))
    return score_html, highlighted
# Language codes offered in the dropdown, in display order.
language_options = [
    "am", "ar", "bg", "bn", "cs", "da", "de", "el", "en", "es",
    "fa", "fi", "fr", "gu", "ha", "hi", "hu", "id", "it", "ja",
    "jv", "kn", "ko", "lt", "mr", "nl", "no", "yo", "zh",
]
# Display name for each language code.
language_map = {
    "am": "Amharic",
    "ar": "Arabic",
    "bg": "Bulgarian",
    "bn": "Bengali",
    "cs": "Czech",
    "da": "Danish",
    "de": "German",
    "el": "Greek",
    "en": "English",
    "es": "Spanish",
    "fa": "Persian",
    "fi": "Finnish",
    "fr": "French",
    "gu": "Gujarati",
    "ha": "Hausa",
    "hi": "Hindi",
    "hu": "Hungarian",
    "id": "Indonesian",
    "it": "Italian",
    "ja": "Japanese",
    "jv": "Javanese",
    "kn": "Kannada",
    "ko": "Korean",
    "lt": "Lithuanian",
    "mr": "Marathi",
    "nl": "Dutch",
    "no": "Norwegian",
    "yo": "Yoruba",
    "zh": "Chinese",
}
# Highlight colour for each language label in the "loaded languages" display.
# Every value must be a colour the browser can resolve (a CSS colour name or
# a hex code). "saffron" is not a CSS colour name, so it is written as hex.
color_map = {
    "am": "green",    # Ethiopia's flag has green
    "ar": "black",    # Many Arab flags feature black
    "bg": "white",    # Bulgaria's flag has white
    "bn": "green",    # Bangladesh's flag is green and red
    "cs": "blue",     # Czech Republic's flag has blue
    "da": "red",      # Denmark's flag is red and white
    "de": "black",    # Germany's flag has black
    "el": "blue",     # Greece's flag has blue
    "en": "red",      # UK/US flags have red
    "es": "yellow",   # Spain's flag has yellow
    "fa": "green",    # Iran's flag has green
    "fi": "blue",     # Finland's flag is blue and white
    "fr": "blue",     # France's flag has blue
    "gu": "#F4C430",  # Saffron: India (Gujarat) flag's color
    "ha": "green",    # Nigeria's flag has green
    "hi": "orange",   # India's flag has orange
    "hu": "red",      # Hungary's flag has red
    "id": "red",      # Indonesia's flag is red and white
    "it": "green",    # Italy's flag has green
    "ja": "red",      # Japan's flag has a red sun
    "jv": "brown",    # Associated with traditional Javanese culture
    "kn": "yellow",   # Karnataka (Indian state) flag has yellow
    "ko": "blue",     # South Korea's flag has blue
    "lt": "yellow",   # Lithuania's flag has yellow
    "mr": "#F4C430",  # Saffron: Marathi culture often uses saffron
    "nl": "orange",   # The Netherlands is often associated with orange
    "no": "red",      # Norway's flag is red, white, and blue
    "yo": "green",    # Nigeria's flag for Yoruba-speaking people
    "zh": "red",      # China's flag is red
}
# Custom stylesheet passed to gr.Blocks. The .nice-box rule styles the score
# panel; #title styles the page header.
css = """
#gen_btn{height: 100%}
#title{text-align: center}
#title h1{font-size: 3em; display:inline-flex; align-items:center}
#title img{width: 100px; margin-right: 0.5em}
#gallery .grid-wrap{height: 10vh}
.card_internal{display: flex;height: 100px;margin-top: .5em}
.card_internal img{margin-right: 1em}
.styler{--form-gap-width: 0px !important}
.nice-box {
border: 2px solid #007bff;
border-radius: 10px;
padding: 15px;
background-color: #f8f9fa;
font-size: 18px;
text-align: center;
min-height: 60px;
display: flex;
align-items: center;
justify-content: center;
}
"""
# Soft Gradio theme with a blue primary hue and a sky secondary hue.
theme = gr.themes.Soft(primary_hue="blue", secondary_hue="sky")
# NOTE(review): the indentation of this section was lost in the copy this was
# reviewed from, so the container nesting below is reconstructed. In
# particular, confirm whether `score` belongs inside the gr.Column next to the
# button or directly under the Blocks root.
with gr.Blocks(theme=theme, css=css) as demo:
    # Page header: logo and title, targeted by the #title CSS rules.
    title = gr.HTML(
        """<h1><img src="https://huggingface.co/spaces/Dusduo/shitsu-text-scorer-demo/resolve/main/shitsu-logo.jpeg" alt="LightBlue"> Shitsu Text Scorer</h1>""",
        elem_id="title",
    )
    gr.Markdown(
        """This is a demo of [Shitsu text scorer](https://huggingface.co/lightblue/shitsu_text_scorer) for multiple languages, which scores text based on the amount of useful, textbook-like information in it.
It outputs a score generally between 0 and 1 but can exceed both of these bounds as it is a regressor.
⚠️ By default, the English version of the scorer is preloaded in memory. When using another language for the first time, beware extensive loading time.
"""
    )

    # Input row: free-text box on the left, submit button and score panel on the right.
    with gr.Row():
        user_text = gr.Textbox(placeholder="Type something here...", label="Input text")
        with gr.Column(scale=0):
            submit_btn = gr.Button("Submit")
            score = gr.HTML(
                label="Output",
                value='<div class="nice-box"> Score... </div>',
            )

    # Language row: selector plus a read-out of the languages held in memory.
    with gr.Row():
        language_choice = gr.Dropdown(
            label="Choose a language",
            info="Type to search",
            choices=language_options,
            value="en",
            allow_custom_value=True,
            scale=3,
        )
        loaded_languages = gr.HighlightedText(
            label="",
            value=[("Currently loaded languages: \n", None), ("English", "en")],
            color_map=color_map,
            combine_adjacent=True,
            show_legend=False,
            scale=1,
        )

    gr.Examples(inputs=user_text, examples=example_inputs)
    gr.Markdown(
        """
---
## 🛈 **Additional Information**
This model can also be found on [Github](https://github.com/lightblue-tech/shitsu) and has its own pip installable package.
This model is based on fasttext embeddings, meaning that it can be used on large amounts of data with limited compute quickly.
This scorer can be used to filter useful information from large text corpora in many languages.
"""
    )

    # Clicking the button scores the text and refreshes both output widgets.
    submit_btn.click(
        get_score,
        inputs=[user_text, language_choice],
        outputs=[score, loaded_languages],
    )

demo.launch()