Spaces:
Running
Running
import streamlit as st | |
from streamlit_extras.switch_page_button import switch_page | |
# UI strings for the MiniGemini page, keyed first by language code ('en' / 'fr')
# and then by section name. Both languages expose the same set of keys.
# The 'tweet_*' entries may contain raw HTML (in-app <a> links).
translations = {
    'en': {
        'title': 'MiniGemini',
        'original_tweet':
            """
[Original tweet](https://x.com/mervenoyann/status/1783864388249694520) (April 26, 2024)
""",
        'tweet_1':
            """
MiniGemini is the coolest VLM, let's explain 🧶
""",
        'tweet_2':
            """
MiniGemini is a vision language model that understands both image and text and also generates text and an image that goes best with the context! 🤯
""",
        'tweet_3':
            """
This model has two image encoders (one CNN and one ViT) in parallel to capture the details in the images.
I saw the same design in <a href='DocOwl_1.5' target='_self'>DocOwl 1.5</a> then it has a decoder to output text and also a prompt to be sent to SDXL for image generation (which works very well!)
""",
        'tweet_4':
            """
They adopt CLIP's ViT for low resolution visual embedding encoder and a CNN-based one for high resolution image encoding (precisely a pre-trained ConvNeXt).
""",
        'tweet_5':
            """
Thanks to the second encoder it can grasp details in images, which also comes in handy for e.g. document tasks (but see below the examples are mindblowing IMO).
""",
        'tweet_6':
            """
According to their reporting the model performs very well across many benchmarks compared to LLaVA 1.5 and Gemini Pro.
""",
        # NOTE: the key is spelled 'ressources' on purpose; the page body reads it
        # under that exact name.
        'ressources':
            """
Resources:
[Mini-Gemini: Mining the Potential of Multi-modality Vision Language Models](https://huggingface.co/papers/2403.18814)
by Yanwei Li, Yuechen Zhang, Chengyao Wang, Zhisheng Zhong, Yixin Chen, Ruihang Chu, Shaoteng Liu, Jiaya Jia (2024)
[GitHub](https://github.com/dvlab-research/MGM)
[Model Repository](https://huggingface.co/YanweiLi/MGM-13B-HD)
""",
    },
    'fr': {
        'title': 'MiniGemini',
        'original_tweet':
            """
[Tweet de base](https://x.com/mervenoyann/status/1783864388249694520) (26 avril 2024)
""",
        'tweet_1':
            """
MiniGemini est le VLM le plus cool, voici pourquoi 🧶
""",
        'tweet_2':
            """
MiniGemini est un modèle de langage/vision qui comprend à la fois l'image et le texte et qui génère également le texte et l'image qui s'accordent le mieux avec le contexte ! 🤯 """,
        # The link target uses the same underscore form as the English entry so
        # that both languages point at the same in-app page.
        'tweet_3':
            """
Ce modèle possède deux encodeurs d'images (un ConvNet et un ViT) en parallèle pour capturer les détails dans les images.
J'ai vu la même conception dans <a href='DocOwl_1.5' target='_self'>DocOwl 1.5</a> où il y a un décodeur pour produire du texte et aussi un prompt à envoyer au SDXL pour la génération d'images (qui fonctionne très bien !). """,
        'tweet_4':
            """
Les auteurs adoptent le ViT de CLIP pour les enchâssements visuels de basse résolution et un ConvNet pour les images en haute résolution (précisément un ConvNeXt pré-entraîné).
""",
        'tweet_5':
            """
Grâce au second encodeur, il peut saisir des détails dans les images, ce qui s'avère également utile pour les tâches documentaires (voir ci-dessous les exemples époustouflants). """,
        'tweet_6':
            """
D'après leur rapport, le modèle est très performant dans de nombreux benchmarks par rapport à LLaVA 1.5 et Gemini Pro.
""",
        'ressources':
            """
Ressources :
[Mini-Gemini: Mining the Potential of Multi-modality Vision Language Models](https://huggingface.co/papers/2403.18814)
de Yanwei Li, Yuechen Zhang, Chengyao Wang, Zhisheng Zhong, Yixin Chen, Ruihang Chu, Shaoteng Liu, Jiaya Jia (2024)
[GitHub](https://github.com/dvlab-research/MGM)
[Modèle](https://huggingface.co/YanweiLi/MGM-13B-HD)
""",
    },
}
def language_selector():
    """Render the language dropdown and return the selected language code.

    The dropdown offers the options ``'EN'`` and ``'FR'``, each displayed as
    its flag emoji, and is registered under the widget key ``'lang_selector'``.

    Returns:
        str: ``'en'`` when ``'EN'`` is selected, ``'fr'`` otherwise.
    """
    languages = {'EN': '🇬🇧', 'FR': '🇫🇷'}
    # Streamlit discourages empty widget labels for accessibility reasons, so
    # give the widget a real label and hide it; "hidden" keeps an empty spacer
    # where the label would be.
    selected_lang = st.selectbox(
        'Language',
        options=list(languages),
        format_func=lambda code: languages[code],
        key='lang_selector',
        label_visibility='hidden',
    )
    return 'en' if selected_lang == 'EN' else 'fr'
# ---------------------------------------------------------------------------
# Page layout: header row (title + language selector), the tweet thread with
# its illustrations, the resources box, and the bottom navigation buttons.
# ---------------------------------------------------------------------------
left_column, right_column = st.columns([5, 1])

# Add a selector to the right column
with right_column:
    lang = language_selector()

# Add a title to the left column
with left_column:
    st.title(translations[lang]["title"])

st.success(translations[lang]["original_tweet"], icon="ℹ️")
st.markdown(""" """)

# Each tweet paragraph is followed by its illustration; the blank markdown
# elements act as vertical spacers between them.
for index in range(1, 7):
    st.markdown(translations[lang][f"tweet_{index}"], unsafe_allow_html=True)
    st.markdown(""" """)
    st.image(f"pages/MiniGemini/image_{index}.jpg", use_container_width=True)
    st.markdown(""" """)

st.info(translations[lang]["ressources"], icon="📚")

for _ in range(3):
    st.markdown(""" """)

# Bottom navigation. Only the button captions depend on the language; the
# destination pages are the same for every language.
# NOTE(review): the French "next" button previously went to "PLLaVA" while the
# English one went to "CuMo". "CuMo" is taken from the English branch; confirm
# it is the intended successor page.
nav_labels = {
    "en": ("Previous paper", "Home", "Next paper"),
    "fr": ("Papier précédent", "Accueil", "Papier suivant"),
}
nav_targets = ("DocOwl 1.5", "Home", "CuMo")

col1, col2, col3 = st.columns(3)
labels = nav_labels["en"] if lang == "en" else nav_labels["fr"]
for column, label, target in zip((col1, col2, col3), labels, nav_targets):
    with column:
        if st.button(label, use_container_width=True):
            switch_page(target)