Spaces:

rjadr
/

ditaduranuncamais_explorer

Sleeping

App Files Files Community

rjadr committed on May 14

Commit

a651226

verified ·

1 Parent(s): 42da8d1

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -38

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import pandas as pd
 import streamlit as st
 import datasets
 import plotly.express as px
-from sentence_transformers import SentenceTransformer
 from PIL import Image
 import os
 from pandas.api.types import (
@@ -29,28 +29,30 @@ from datetime import datetime
 #st.set_page_config(layout="wide")
-model_dir = "./models/sbert.net_models_sentence-transformers_clip-ViT-B-32-multilingual-v1"
-@st.cache_data(show_spinner=True)
-def download_models():
-    # Directory doesn't exist, download and extract the model
-    subprocess.run(["mkdir", "models"])
-    subprocess.run(["wget", "--no-check-certificate", "https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/clip-ViT-B-32-multilingual-v1.zip"], check=True)
-    subprocess.run(["unzip", "-q", "clip-ViT-B-32-multilingual-v1.zip", "-d", model_dir], check=True)
 token_ = st.secrets["token"]
 @st.cache_data(show_spinner=True)
 def load_dataset():
-    dataset = datasets.load_dataset('rjadr/ditaduranuncamais', split='train', use_auth_token=token_)
-    dataset.add_faiss_index(column="txt_embs")
     dataset.add_faiss_index(column="img_embs")
     dataset = dataset.remove_columns(['Post Created Date', 'Post Created Time','Like and View Counts Disabled','Link','Download URL','Views'])
     return dataset
 @st.cache_data(show_spinner=False)
 def load_dataframe(_dataset):
-    dataframe = _dataset.remove_columns(['txt_embs', 'img_embs']).to_pandas()
     # Extract hashtags with regex and convert to set
     dataframe['Hashtags'] = dataframe.apply(lambda row: f"{row['Description']} {row['Image Text']}", axis=1)
     dataframe['Hashtags'] = dataframe['Hashtags'].str.lower().str.findall(r'#(\w+)').apply(set)
@@ -60,16 +62,6 @@ def load_dataframe(_dataset):
     dataframe = dataframe[['Post Created', 'image', 'Description', 'Image Text', 'Account', 'User Name'] + [col for col in dataframe.columns if col not in ['Post Created', 'image', 'Description', 'Image Text', 'Account', 'User Name']]]
     return dataframe
-@st.cache_resource(show_spinner=True)
-def load_img_model():
-    # We use the original clip-ViT-B-32 for encoding images
-    return SentenceTransformer('clip-ViT-B-32')
-@st.cache_resource(show_spinner=True)
-def load_txt_model():
-    # Our text embedding model is aligned to the img_model and maps 50+
-    # languages to the same vector space
-    return SentenceTransformer('./models/sbert.net_models_sentence-transformers_clip-ViT-B-32-multilingual-v1')
 def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """
@@ -146,27 +138,62 @@ def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     return df
 @st.cache_data
-def get_image_embs(image):
     """
     Get image embeddings
     Parameters:
     uploaded_file (PIL.Image): Uploaded image file
     Returns:
     img_emb (np.array): Image embeddings
     """
-    img_emb = image_model.encode(Image.open(image))
     return img_emb
 @st.cache_data(show_spinner=False)
-def get_text_embs(text):
     """
     Get text embeddings
     Parameters:
     text (str): Text to encode
     Returns:
     text_emb (np.array): Text embeddings
     """
-    txt_emb = text_model.encode(text)
     return txt_emb
 @st.cache_data
@@ -186,7 +213,7 @@ def postprocess_results(scores, samples):
     samples_df["score"] = samples_df["score"].astype(int)
     samples_df.reset_index(inplace=True, drop=True)
     samples_df = samples_df[['Post Created', 'image', 'Description', 'Image Text', 'Account', 'User Name'] + [col for col in samples_df.columns if col not in ['Post Created', 'image', 'Description', 'Image Text', 'Account', 'User Name']]]
-    return samples_df.drop(columns=['txt_embs', 'img_embs'])
 @st.cache_data
 def text_to_text(text, k=5):
@@ -198,8 +225,8 @@ def text_to_text(text, k=5):
     Returns:
     results (list): List of tuples of PIL images and labels/scores
     """
-    text_emb = get_text_embs(text)
-    scores, samples = dataset.get_nearest_examples('txt_embs', text_emb, k=k)
     return postprocess_results(scores, samples)
 @st.cache_data
@@ -212,8 +239,8 @@ def image_to_text(image, k=5):
     Returns:
     results (list): List of tuples of PIL images and labels/scores
     """
-    img_emb = get_image_embs(image.name)
-    scores, samples = dataset.get_nearest_examples('txt_embs', img_emb, k=k)
     return postprocess_results(scores, samples)
 @st.cache_data
@@ -226,7 +253,7 @@ def text_to_image(text, k=5):
     Returns:
     results (list): List of tuples of PIL images and labels/scores
     """
-    text_emb = get_text_embs(text)
     scores, samples = dataset.get_nearest_examples('img_embs', text_emb, k=k)
     return postprocess_results(scores, samples)
@@ -240,7 +267,7 @@ def image_to_image(image, k=5):
     Returns:
     results (list): List of tuples of PIL images and labels/scores
     """
-    img_emb = get_image_embs(image.name)
     scores, samples = dataset.get_nearest_examples('img_embs', img_emb, k=k)
     return postprocess_results(scores, samples)
@@ -546,13 +573,12 @@ if not check_password():
     st.stop()
 # Check if the directory exists
-if not os.path.exists(model_dir):
-    download_models()
 dataset = load_dataset()
 df = load_dataframe(dataset)
-image_model = load_img_model()
-text_model = load_txt_model()
 menu_options = ["Data exploration", "Semantic search", "Hashtags", "Clustering", "Stats"]
@@ -868,7 +894,7 @@ elif selected_menu_option == "Clustering":
     st.markdown("### Clustering Results")
     if type_embeddings == "Text":
-        embeddings = dataset['txt_embs']
     elif type_embeddings == "Image":
         embeddings = dataset['img_embs']

 import streamlit as st
 import datasets
 import plotly.express as px
+from transformers import AutoProcessor, AutoModel
 from PIL import Image
 import os
 from pandas.api.types import (
 #st.set_page_config(layout="wide")
+model_name = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
 token_ = st.secrets["token"]
+@st.cache_resource(show_spinner=True)
+def load_model(model_name):
+    """
+    Load the model and processor
+    """
+    processor = AutoProcessor.from_pretrained(model_name)
+    model = AutoModel.from_pretrained(model_name)
+    return processor, model
 @st.cache_data(show_spinner=True)
 def load_dataset():
+    dataset = datasets.load_dataset('rjadr/ditaduranuncamais', split='train', token=token_)
+    dataset.add_faiss_index(column="text_embs")
     dataset.add_faiss_index(column="img_embs")
     dataset = dataset.remove_columns(['Post Created Date', 'Post Created Time','Like and View Counts Disabled','Link','Download URL','Views'])
     return dataset
 @st.cache_data(show_spinner=False)
 def load_dataframe(_dataset):
+    dataframe = _dataset.remove_columns(['text_embs', 'img_embs']).to_pandas()
     # Extract hashtags with regex and convert to set
     dataframe['Hashtags'] = dataframe.apply(lambda row: f"{row['Description']} {row['Image Text']}", axis=1)
     dataframe['Hashtags'] = dataframe['Hashtags'].str.lower().str.findall(r'#(\w+)').apply(set)
     dataframe = dataframe[['Post Created', 'image', 'Description', 'Image Text', 'Account', 'User Name'] + [col for col in dataframe.columns if col not in ['Post Created', 'image', 'Description', 'Image Text', 'Account', 'User Name']]]
     return dataframe
 def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """
     return df
 @st.cache_data
+def get_image_embs(_processor, _model, uploaded_file):
     """
     Get image embeddings
     Parameters:
+    processor (transformers.AutoProcessor): Processor for the model
+    model (transformers.AutoModel): Model to use for embeddings
     uploaded_file (PIL.Image): Uploaded image file
     Returns:
     img_emb (np.array): Image embeddings
     """
+    # Load the image from local path
+    image = Image.open(uploaded_file)
+    # Process the image
+    inputs = _processor(images=image, return_tensors="pt")
+    # Forward pass without gradient calculation
+    outputs = _model.get_image_features(**inputs)
+    # Normalize the image embeddings
+    img_embs = outputs / outputs.norm(dim=-1, keepdim=True)
+    # Convert to list and add to example
+    img_emb = img_embs.squeeze(0).detach().cpu().numpy()
     return img_emb
 @st.cache_data(show_spinner=False)
+def get_text_embs(_processor, _model, text):
     """
     Get text embeddings
     Parameters:
+    processor (transformers.AutoProcessor): Processor for the model
+    model (transformers.AutoModel): Model to use for embeddings
     text (str): Text to encode
     Returns:
     text_emb (np.array): Text embeddings
     """
+    # Process the text with truncation
+    inputs = _processor(
+        text=text,
+        return_tensors="pt",
+        padding="max_length",
+        truncation=True,
+        max_length=77  # CLIP's maximum sequence length
+    )
+    # Forward pass without gradient calculation
+    outputs = _model.get_text_features(**inputs)
+    # Normalize the text embeddings
+    text_embs = outputs / outputs.norm(dim=-1, keepdim=True)
+    # Convert to list and add to example
+    txt_emb = text_embs.squeeze(0).detach().cpu().numpy()
     return txt_emb
 @st.cache_data
     samples_df["score"] = samples_df["score"].astype(int)
     samples_df.reset_index(inplace=True, drop=True)
     samples_df = samples_df[['Post Created', 'image', 'Description', 'Image Text', 'Account', 'User Name'] + [col for col in samples_df.columns if col not in ['Post Created', 'image', 'Description', 'Image Text', 'Account', 'User Name']]]
+    return samples_df.drop(columns=['text_embs', 'img_embs'])
 @st.cache_data
 def text_to_text(text, k=5):
     Returns:
     results (list): List of tuples of PIL images and labels/scores
     """
+    text_emb = get_text_embs(processor, model, text)
+    scores, samples = dataset.get_nearest_examples('text_embs', text_emb, k=k)
     return postprocess_results(scores, samples)
 @st.cache_data
     Returns:
     results (list): List of tuples of PIL images and labels/scores
     """
+    img_emb = get_image_embs(processor, model, image.name)
+    scores, samples = dataset.get_nearest_examples('text_embs', img_emb, k=k)
     return postprocess_results(scores, samples)
 @st.cache_data
     Returns:
     results (list): List of tuples of PIL images and labels/scores
     """
+    text_emb = get_text_embs(processor, model, text)
     scores, samples = dataset.get_nearest_examples('img_embs', text_emb, k=k)
     return postprocess_results(scores, samples)
     Returns:
     results (list): List of tuples of PIL images and labels/scores
     """
+    img_emb = get_image_embs(processor, model, image.name)
     scores, samples = dataset.get_nearest_examples('img_embs', img_emb, k=k)
     return postprocess_results(scores, samples)
     st.stop()
 # Load the dataset, the dataframe, and the CLIP processor/model
 dataset = load_dataset()
 df = load_dataframe(dataset)
+processor, model = load_model(model_name)
+#image_model = load_img_model()
+#text_model = load_txt_model()
 menu_options = ["Data exploration", "Semantic search", "Hashtags", "Clustering", "Stats"]
     st.markdown("### Clustering Results")
     if type_embeddings == "Text":
+        embeddings = dataset['text_embs']
     elif type_embeddings == "Image":
         embeddings = dataset['img_embs']