Spaces:

somewheresystems
/

dataclysm

Paused

File size: 11,410 Bytes

21bee4f

# Import necessary libraries
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from datasets import load_dataset, Dataset
from sklearn.cluster import KMeans
import plotly.graph_objects as go
import time
import logging


# Additional libraries for querying
from FlagEmbedding import FlagModel

# Global configuration and one-time resource loading.
# Streamlit re-executes this script from the top on every widget
# interaction, so the dataset and the embedding model are kept in
# st.session_state and only loaded the first time a session runs the script.
dataset_name = 'somewheresystems/dataclysm-arxiv'
if 'dataclysm_arxiv' not in st.session_state:
    st.session_state.dataclysm_arxiv = load_dataset(dataset_name, split="train")
total_samples = len(st.session_state.dataclysm_arxiv)

logging.basicConfig(filename='app.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)

# Initialize the model used to embed search queries (once per session).
if 'flag_model' not in st.session_state:
    st.session_state.flag_model = FlagModel('BAAI/bge-small-en-v1.5', query_instruction_for_retrieval="Represent this sentence for searching relevant passages:", use_fp16=True)
model = st.session_state.flag_model


def load_data(num_samples):
    """Load the arXiv dataset and draw a random sample of its rows.

    Progress and timing information is written to the Streamlit sidebar.

    Args:
        num_samples: Requested number of rows; capped at the number of rows
            in the dataset's ``train`` split.

    Returns:
        A ``(df, embeddings)`` tuple: the sampled pandas DataFrame and an
        object-dtype NumPy array built from its ``title_embedding`` column,
        in the same row order as ``df``.
    """
    start_time = time.time()
    logging.info('Loading dataset...')
    # Reuse the split already held in session state when it is there;
    # otherwise fall back to loading it from the module-level dataset name.
    train_split = st.session_state.get('dataclysm_arxiv')
    if train_split is None:
        train_split = load_dataset(dataset_name, split="train")
    total_samples = len(train_split)

    logging.info('Converting to pandas dataframe...')
    df = train_split.to_pandas()

    # Never ask pandas for more rows than exist.
    num_samples = min(num_samples, total_samples)
    # Guard the percentage against an empty split (would divide by zero).
    fraction = num_samples / total_samples if total_samples else 0.0
    st.sidebar.text(f'Number of samples: {num_samples} ({fraction:.2%} of total)')

    # Randomly sample the dataframe; the original row index is preserved.
    df = df.sample(n=num_samples)

    # The 'title_embedding' column holds one embedding per row.
    embeddings = df['title_embedding'].tolist()
    print("embeddings length: " + str(len(embeddings)))

    # Wrap the per-row embeddings in a NumPy array without forcing a dtype.
    embeddings = np.array(embeddings, dtype=object)
    end_time = time.time()  # End timing
    st.sidebar.text(f'Data loading completed in {end_time - start_time:.3f} seconds')
    return df, embeddings

def perform_tsne(embeddings):
    """Reduce the embeddings to three dimensions with t-SNE.

    The perplexity is 30, lowered to ``n_samples - 1`` for small inputs
    (and 1 when there is at most one sample). Status messages go to the
    main page and the elapsed time to the sidebar.

    Args:
        embeddings: NumPy array of per-sample embeddings; it must support
            ``len()``, iteration and ``.tolist()``.

    Returns:
        The array produced by ``TSNE.fit_transform`` (three components per
        sample).

    Raises:
        ValueError: If the entries of ``embeddings`` differ in length.
    """
    started = time.time()
    logging.info('Performing t-SNE...')

    n_samples = len(embeddings)
    if n_samples > 1:
        perplexity = min(30, n_samples - 1)
    else:
        perplexity = 1

    # Every entry must have the same length before the rows can be stacked.
    distinct_lengths = {len(vector) for vector in embeddings}
    if len(distinct_lengths) > 1:
        raise ValueError("All embeddings should have the same length")

    reducer = TSNE(n_components=3, perplexity=perplexity, n_iter=300)

    # Placeholder that is overwritten once the reduction has finished.
    status = st.empty()
    status.text("t-SNE in progress...")

    stacked = np.vstack(embeddings.tolist())
    tsne_results = reducer.fit_transform(stacked)

    status.text(f"t-SNE completed. Processed {n_samples} samples with perplexity {perplexity}.")
    elapsed = time.time() - started
    st.sidebar.text(f't-SNE completed in {elapsed:.3f} seconds')
    return tsne_results


def perform_clustering(df, tsne_results, n_clusters=16):
    """Attach the t-SNE coordinates to ``df`` and cluster them with k-means.

    ``df`` is modified in place: the columns ``tsne-3d-one``,
    ``tsne-3d-two``, ``tsne-3d-three`` and ``cluster`` are added (or
    overwritten). The elapsed time is written to the sidebar.

    Args:
        df: DataFrame with one row per row of ``tsne_results``.
        tsne_results: 2-D array whose first three columns are the t-SNE
            coordinates.
        n_clusters: Number of k-means clusters to request. It is reduced to
            the number of rows when ``df`` has fewer rows than that.

    Returns:
        The same ``df`` object, with the new columns.
    """
    start_time = time.time()
    logging.info('Performing k-means clustering...')

    # Store the three t-SNE axes as dataframe columns.
    tsne_columns = ['tsne-3d-one', 'tsne-3d-two', 'tsne-3d-three']
    for axis, column in enumerate(tsne_columns):
        df[column] = tsne_results[:, axis]

    # KMeans fails when asked for more clusters than there are samples, so
    # clamp the request for small inputs.
    n_clusters = max(1, min(n_clusters, len(df)))
    kmeans = KMeans(n_clusters=n_clusters)
    df['cluster'] = kmeans.fit_predict(df[tsne_columns])
    end_time = time.time()  # End timing
    st.sidebar.text(f'k-means clustering completed in {end_time - start_time:.3f} seconds')
    return df

def _reshape_and_add_faiss_index(dataset, column_name):
    """Flatten each embedding in ``column_name`` and build a FAISS index on it.

    ``dataset`` is a pandas DataFrame; its ``column_name`` column is
    overwritten in place with flattened NumPy arrays. Returns a
    ``datasets.Dataset`` created from the frame with the index attached.
    """
    # Each row may be nested like [[v0, v1, ...]]; flatten it to
    # [v0, v1, ...] before indexing.
    print(f"Flattening {column_name} and adding FAISS index...")
    dataset[column_name] = dataset[column_name].apply(lambda x: np.array(x).flatten())
    dataset = Dataset.from_pandas(dataset).add_faiss_index(column=column_name)
    print(f"FAISS index for {column_name} added.")
    return dataset


def _build_figure(df):
    """Build the 3D scatter plot of the t-SNE coordinates, colored by cluster."""
    fig = go.Figure(data=[go.Scatter3d(
        x=df['tsne-3d-one'],
        y=df['tsne-3d-two'],
        z=df['tsne-3d-three'],
        mode='markers',
        hovertext=df['hovertext'],
        hoverinfo='text',
        marker=dict(
            size=1,
            color=df['cluster'],
            colorscale='Viridis',
            opacity=0.8
        )
    )])

    fig.update_layout(
        plot_bgcolor='#202020',
        height=800,
        margin=dict(l=0, r=0, b=0, t=0),
        scene=dict(
            xaxis=dict(showbackground=True, backgroundcolor="#000000"),
            yaxis=dict(showbackground=True, backgroundcolor="#000000"),
            zaxis=dict(showbackground=True, backgroundcolor="#000000"),
        ),
        scene_camera=dict(eye=dict(x=0.001, y=0.001, z=0.001))
    )
    return fig


def main():
    """Render the Streamlit app: 3D t-SNE map, title search and detail view.

    Until data has been loaded, the sidebar offers a sample-size slider and
    an 'Initialize' button. Pressing it loads a sample, builds a FAISS index
    over ``title_embedding``, runs t-SNE and k-means, and stores the
    dataframe, the t-SNE output, the indexed dataset and the Plotly figure
    in ``st.session_state``. Once data is loaded the figure is shown, and
    the sidebar exposes a query box and a per-paper detail view.
    """
    # Custom CSS
    # NOTE(review): the @font-face `src` below points at a Google Fonts
    # stylesheet URL rather than a font file, so font 'F' presumably never
    # loads and the sans-serif fallback is used -- confirm before changing.
    custom_css = """
    <style>
        /* Define the font */
        @font-face {
            font-family: 'F';
            src: url('https://fonts.googleapis.com/css2?family=Martian+Mono&display=swap') format('truetype');
        }
        /* Apply the font to all elements */
        * {
            font-family: 'F', sans-serif !important;
            color: #F8F8F8; /* Set the font color to F8F8F8 */
        }
        /* Add your CSS styles here */
        h1 {
            text-align: center;
        }
        h2,h3,h4 {
            text-align: justify;
            font-size: 8px
        }
        body {
            text-align: justify;
        }
        .stSlider .css-1cpxqw2 {
            background: #202020;
        }
        .stButton > button {
            background-color: #202020;
            width: 100%;
            border: none;
            padding: 10px 24px;
            border-radius: 5px;
            font-size: 16px;
            font-weight: bold;
        }
        .reportview-container .main .block-container {
            padding: 2rem;
            background-color: #202020;
        }
    </style>
    """

    # Inject custom CSS with markdown
    st.markdown(custom_css, unsafe_allow_html=True)
    st.sidebar.markdown(
        '<img src="https://www.somewhere.systems/S2-white-logo.png" style="float: bottom-left; width: 32px; height: 32px; opacity: 1.0; animation: fadein 2s;">',
        unsafe_allow_html=True
    )
    st.sidebar.title('Spatial Search Engine')

    # Offer the initialization controls until a sample has been processed.
    if not st.session_state.get('data_loaded'):
        # User input for number of samples
        num_samples = st.sidebar.slider('Select number of samples', 1000, total_samples, 1000)

        if st.sidebar.button('Initialize'):
            st.sidebar.text('Initializing data pipeline...')

            df, embeddings = load_data(num_samples)

            # Write the embeddings back into the frame as flat lists so the
            # column can be indexed.
            df['title_embedding'] = [embedding.flatten().tolist() for embedding in embeddings]
            # Print the first few rows of the dataframe to check
            print(df.head())
            # Add the FAISS index for 'title_embedding'
            st.session_state.dataclysm_title_indexed = _reshape_and_add_faiss_index(df, 'title_embedding')
            tsne_results = perform_tsne(embeddings)
            df = perform_clustering(df, tsne_results)

            # Create custom hover text
            df['hovertext'] = df.apply(
                lambda row: f"<b>Title:</b> {row['title']}<br><b>arXiv ID:</b> {row['id']}<br><b>Key:</b> {row.name}", axis=1
            )
            st.sidebar.text("Datasets loaded, titles indexed.")

            # Store everything first and flip `data_loaded` last, so a
            # failure part-way through never leaves the session marked as
            # loaded without a figure to display.
            st.session_state.df = df
            st.session_state.tsne_results = tsne_results
            st.session_state.fig = _build_figure(df)
            st.session_state.data_loaded = True

    # Display the plot if data is loaded
    if st.session_state.get('data_loaded'):
        st.plotly_chart(st.session_state.fig, use_container_width=True)

    # Sidebar: querying and detailed view
    if 'df' in st.session_state:
        with st.sidebar:
            st.sidebar.markdown("### Query Embeddings")
            query = st.text_input("Enter your query:")
            if st.button("Search"):
                # Reuse the module-level model instead of constructing a new
                # FlagModel with the same settings on every search.
                query_embedding = model.encode([query])
                # FAISS searches float32 vectors; the model is created with
                # use_fp16=True and may return half precision, so cast
                # defensively (no-op when already float32).
                query_embedding = np.asarray(query_embedding, dtype=np.float32)
                # Retrieve the ten nearest titles
                scores_title, retrieved_examples_title = st.session_state.dataclysm_title_indexed.get_nearest_examples('title_embedding', query_embedding, k=10)
                df_query = pd.DataFrame(retrieved_examples_title)
                df_query['proximity'] = scores_title
                df_query = df_query.sort_values(by='proximity', ascending=True)
                # Limit the score to 3 decimal points
                df_query['proximity'] = df_query['proximity'].round(3)
                # Clickable arXiv link; rendered as HTML because of escape=False
                df_query['URL'] = df_query['id'].apply(lambda x: f'<a href="https://arxiv.org/abs/{x}" target="_blank">Link</a>')
                # Include the URL column, which was previously built but
                # never shown.
                st.sidebar.markdown(df_query[['title', 'proximity', 'id', 'URL']].to_html(escape=False), unsafe_allow_html=True)
            st.sidebar.markdown("# Detailed View")
            selected_index = st.sidebar.selectbox("Select Key", st.session_state.df.id)

            # Display metadata for the selected article
            selected_row = st.session_state.df[st.session_state.df['id'] == selected_index].iloc[0]
            st.markdown(f"### Title\n{selected_row['title']}", unsafe_allow_html=True)
            st.markdown(f"### Abstract\n{selected_row['abstract']}", unsafe_allow_html=True)
            st.markdown(f"[Read the full paper](https://arxiv.org/abs/{selected_row['id']})", unsafe_allow_html=True)
            st.markdown(f"[Download PDF](https://arxiv.org/pdf/{selected_row['id']})", unsafe_allow_html=True)
        

# Entry point: build the UI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()