In [2]:
import numpy as np
import plotly.express as px

def farthest_first_traversal(points, k, return_type='indices'):
    """Select up to ``k`` mutually distant points by greedy farthest-first traversal.

    The first center is drawn at random with ``np.random.randint``; every
    following center is the not-yet-selected point whose Euclidean distance
    to its nearest selected center is largest.

    Parameters
    ----------
    points : sequence of vectors
        A 2-D array with one point per row, or any indexable sequence of
        equal-length vectors (for example an object array of embeddings
        taken from a DataFrame column).
    k : int
        Number of centers requested. It is capped at ``len(points)`` so no
        point is ever selected twice; ``k <= 0`` yields an empty result.
    return_type : {'indices', 'values'}
        ``'indices'`` returns the positions of the selected points,
        ``'values'`` returns the selected points themselves.

    Returns
    -------
    numpy.ndarray
        Selected positions (``'indices'``) or selected points (``'values'``),
        in the order they were chosen.

    Raises
    ------
    ValueError
        If ``return_type`` is neither ``'indices'`` nor ``'values'``.
    """
    if return_type not in ('indices', 'values'):
        raise ValueError(
            f"return_type must be 'indices' or 'values', got {return_type!r}"
        )

    n_points = len(points)
    k = min(k, n_points)

    chosen = []
    if k > 0:
        # Build a real 2-D float matrix even when `points` is an object array
        # (or list) of separate vectors, so distances can be vectorised.
        coords = np.array([np.asarray(p, dtype=float) for p in points], dtype=float)
        coords = coords.reshape(n_points, -1)

        # randomly select first point from the dataset
        first = int(np.random.randint(0, n_points))
        chosen.append(first)

        # nearest[i] = distance from point i to its closest selected center.
        # Selected points are pinned to -inf so argmax can never pick them again,
        # even when the data contains exact duplicates.
        nearest = np.linalg.norm(coords - coords[first], axis=1)
        nearest[first] = -np.inf

        while len(chosen) < k:
            # select the point farthest away from the centers
            farthest = int(np.argmax(nearest))
            chosen.append(farthest)

            # Only the new center can shrink a point's nearest-center distance,
            # so one pass against it keeps `nearest` up to date.
            nearest = np.minimum(nearest, np.linalg.norm(coords - coords[farthest], axis=1))
            nearest[farthest] = -np.inf

    if return_type == 'values':
        # Return the caller's original entries so their dtype is preserved.
        return np.array([points[i] for i in chosen])
    return np.array(chosen, dtype=np.intp)

# generate sample data (seeded so the figure is identical on every run)
np.random.seed(42)
points = np.random.randn(100, 20) #last param is the n. dimensions

centers = farthest_first_traversal(points, k=10, return_type='values')
# visualize results using Plotly; only the first two of the 20 dimensions are drawn
fig = px.scatter(
    x=points[:,0],
    y=points[:,1],
    labels={'x': 'dimension 0', 'y': 'dimension 1'},
    title='Farthest-first traversal: selected centers among the sample points',
)
fig.add_scatter(
    x=centers[:,0],
    y=centers[:,1],
    mode='markers',
    marker=dict(size=10, color='red'),
    name='selected centers',
)
fig.show()

In [1]:
import os
# os.system('pip install openpyxl')
# os.system('pip install sentence-transformers')
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to encode search queries.
# Candidates noted by the author: all-MiniLM-L6-v2, all-mpnet-base-v2
model = SentenceTransformer('all-mpnet-base-v2')

# Load the startup table and turn every `tags` entry into its string form.
df = pd.read_parquet('df_encoded3.parquet')
df['tags'] = df['tags'].map(str)
def parse_raised(x):
    """Convert a funding string such as ``'$500K'`` into an amount in millions.

    The first character is treated as the currency symbol and the last one
    as the magnitude: ``K`` (thousands), ``M`` (millions) or ``B`` (billions).
    A value that ends in a digit is read as a plain amount in currency units.
    Thousands separators (``,``) and surrounding whitespace are ignored.

    Parameters
    ----------
    x : str
        Raw value, e.g. ``'$1.5M'``, ``'$500K'`` or ``'Undisclosed'``.

    Returns
    -------
    float or int
        The amount in millions; ``0`` for ``'Undisclosed'``.

    Raises
    ------
    ValueError
        If the value is empty, has an unknown magnitude suffix, or its
        numeric part cannot be parsed.
    """
    if x == 'Undisclosed':
        return 0

    text = x.strip().replace(',', '')
    if not text:
        raise ValueError('cannot parse an empty raised amount')

    quantifier = text[-1].upper()
    if quantifier.isdigit():
        # No magnitude letter: everything after the currency symbol is the amount.
        return float(text[1:]) / 1_000_000

    amount = float(text[1:-1])
    if quantifier == 'K':
        return amount / 1000
    if quantifier == 'M':
        return amount
    if quantifier == 'B':
        return amount * 1000
    # Previously an unknown suffix fell through and silently returned None.
    raise ValueError(f'unrecognised magnitude suffix {quantifier!r} in {x!r}')
# Run every `raised` value through parse_raised and lower-case every `stage` value.
df['raised'] = df['raised'].map(parse_raised)
df['stage'] = df['stage'].map(lambda stage_name: stage_name.lower())
# Renumber the rows 0..n-1, discarding the previous index.
df = df.reset_index(drop=True)

from sklearn.neighbors import NearestNeighbors
import pandas as pd
from sentence_transformers import SentenceTransformer

# Fit the neighbour index on every stored embedding. n_neighbors is capped at the
# number of rows because kneighbors() rejects a request for more neighbours than
# there are fitted samples.
nbrs = NearestNeighbors(n_neighbors=min(5000, len(df)), algorithm='ball_tree').fit(df['text_vector_'].values.tolist())

def search(df, query):
    """Return the rows of ``df`` that are nearest to ``query`` in embedding space.

    The query is encoded with the module-level ``model`` and looked up in the
    module-level ``nbrs`` index. The positions returned by the index are
    applied to ``df`` with ``iloc``, so ``df`` must have the same row order as
    the data ``nbrs`` was fitted on.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to pick rows from; must contain the columns listed below.
    query : str
        Free-text description to search for.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, in the order produced by
        ``nbrs.kneighbors``, restricted to the columns name, raised, target,
        size, stage, country, source, description, tags and text_vector_.
    """
    query_vector = model.encode(query).tolist()

    # Only the neighbour positions are used, so skip returning the distances.
    neighbour_rows = nbrs.kneighbors([query_vector], return_distance=False)[0]

    columns = ['name', 'raised', 'target', 'size', 'stage', 'country', 'source', 'description', 'tags', 'text_vector_']
    # .copy() lets callers add or overwrite columns without pandas'
    # chained-assignment warning.
    return df.iloc[neighbour_rows][columns].copy()

def filter_df(df, column_name, filter_type, filter_value, minimum_acceptable_size=0):
    """Filter ``df`` on one column, falling back to ``df`` if too few rows survive.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter.
    column_name : str
        Column the condition is applied to.
    filter_type : {'==', '>=', '<=', 'contains'}
        Comparison to perform. ``'contains'`` uses ``Series.str.contains``,
        which interprets ``filter_value`` as a regular expression.
    filter_value
        Value (or pattern) to compare against.
    minimum_acceptable_size : int, default 0
        Minimum number of matching rows required to accept the filter.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, or the unfiltered ``df`` itself when fewer than
        ``minimum_acceptable_size`` rows match.

    Raises
    ------
    ValueError
        If ``filter_type`` is not one of the supported operators.
    """
    column = df[column_name]
    if filter_type == '==':
        mask = column == filter_value
    elif filter_type == '>=':
        mask = column >= filter_value
    elif filter_type == '<=':
        mask = column <= filter_value
    elif filter_type == 'contains':
        # Filter the requested column (this used to be hard-wired to 'target').
        # na=False treats missing cells as non-matching, keeping the mask boolean.
        mask = column.str.contains(filter_value, na=False)
    else:
        raise ValueError(
            f"unsupported filter_type {filter_type!r}; expected '==', '>=', '<=' or 'contains'"
        )

    df_filtered = df[mask]
    # Compare row counts: DataFrame.size is rows * columns and would overstate
    # the number of matches.
    if len(df_filtered) >= minimum_acceptable_size:
        return df_filtered
    return df

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def score_filter(df, query, min_score):
    """Keep the rows of ``df`` whose embedding is similar enough to ``query``.

    The query is encoded with the module-level ``model`` and compared, by
    cosine similarity, with every vector in the ``text_vector_`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``text_vector_`` column of equal-length vectors.
        It is not modified.
    query : str
        Free-text query to score the rows against.
    min_score : float
        Minimum cosine similarity a row needs to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with an added ``similarity`` column, containing only the
        rows with ``similarity >= min_score``.
    """
    if len(df) == 0:
        # Nothing to score; still return the expected column.
        return df.assign(similarity=np.array([], dtype=float))

    vectors = np.array(df['text_vector_'].tolist())
    query_vector = model.encode([query])[0]

    # One matrix call scores every row at once instead of one sklearn call per row.
    scores = cosine_similarity(vectors, [query_vector])[:, 0]

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    scored = df.assign(similarity=scores)
    return scored[scored['similarity'] >= min_score]



In [4]:
# Take the 20 nearest matches, then pick 5 of them that are spread out in
# embedding space and show their name and description.
df_results = search(df, 'age-reversing').iloc[:20]
points = df_results['text_vector_'].values
indices = farthest_first_traversal(points, k=5, return_type='indices')
df_results.iloc[indices][['name', 'description']].to_numpy()

array([['Klogene Therapeutics, Inc.',
        'Prevention and treatment of age related diseases'],
       ['Inverse',
        'Inverse is changing the way women all around the world condition and care for their hair.'],
       ['AGELON М',
        'Agelon M is the service of online investigations based on automated targeting, verification and surveying respondents in social networks.'],
       ['Age of Learning',
        'Age of Learning blends education best practices, innovative technology, and insightful creativity to bring learning to life for children across the U.S. and around the world.'],
       ['Aprilage Inc',
        'Seeing IS believing!  Aprilage develops visualization software that shows people their "future self" and how their lifestyle of today will affect how they will look as they age. Our software, APRIL®, is currently used by 500 health providers, educators and insurers in more than 25 countries as a tool for health education about chronic disease prevention and beh

In [48]:
import requests

def gpt3_question(api_key, prompt):
    """Send ``prompt`` to the OpenAI completions endpoint and return the text.

    Parameters
    ----------
    api_key : str
        Key sent as the bearer token in the ``Authorization`` header.
    prompt : str
        Prompt to complete.

    Returns
    -------
    str
        The ``text`` field of the first entry in the response's ``choices``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    requests.Timeout
        If the server does not respond in time.
    RuntimeError
        If the response contains no completion.
    """
    # NOTE(review): the engine-specific endpoint and text-davinci-003 appear to
    # have been retired by OpenAI - confirm and migrate if so.
    api_endpoint = "https://api.openai.com/v1/engines/text-davinci-003/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    data = {
        "prompt": prompt,
        "max_tokens": 500,
        "temperature": 0.7
    }
    print('sending request')
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.post(api_endpoint, headers=headers, json=data, timeout=120)
    print(response.text)

    # Surface API errors (bad key, quota, ...) as HTTP errors rather than as a
    # confusing KeyError on "choices" further down.
    response.raise_for_status()

    choices = response.json().get("choices")
    if not choices:
        raise RuntimeError("OpenAI response did not contain any completion")

    return choices[0]["text"]

def competitor_analysis_foo(startup_array, max_paragraphs):
    """Ask the completion model for a competitor analysis of ``startup_array``.

    The API key is read from the ``OPENAI_API_KEY`` environment variable so
    that no credential is stored in the notebook.

    Parameters
    ----------
    startup_array : list
        Startups in the format ``[name, stage, description]``; its string
        form is embedded in the prompt.
    max_paragraphs : int
        Number of paragraphs requested in the prompt.

    Returns
    -------
    str
        The generated text with ``Paragraph N`` labels removed, consecutive
        newlines collapsed and surrounding whitespace stripped.

    Raises
    ------
    RuntimeError
        If ``OPENAI_API_KEY`` is not set.
    """
    import os
    import re

    # Never hard-code the key: anything committed with the notebook is public.
    api_key = os.environ.get('OPENAI_API_KEY')
    if not api_key:
        raise RuntimeError('Set the OPENAI_API_KEY environment variable before requesting an analysis')

    prompt = f"""
    {str(startup_array)}
    This is a list of startups in the following format: [name, stage, description]:

    Write a {max_paragraphs} paragraph competitors analysis based on this data. Do not name the paragraphs.
    """
    response = gpt3_question(api_key, prompt)

    # Drop "Paragraph N" / "Paragraph N:" labels for any N (a digit-by-digit
    # replace would leave a stray "0" behind for "Paragraph 10").
    response = re.sub(r'Paragraph \d+:?', '', response)
    # Collapse runs of newlines into a single line break.
    response = re.sub(r'\n{2,}', '\n', response).strip()

    return response

In [51]:
#the first module becomes text1, the second module file1
def vector_search(size, target, stage, query, var_metadata, var_fresh): #greet('11-500+', 'B2B', 'pre-seed', 'age-reversing')
    """Search the startup table for ``query`` and narrow the hits by the UI filters.

    Each filter is applied through ``filter_df`` with a minimum size of 1,
    so a filter that would remove every row is skipped.

    Parameters
    ----------
    size : str
        Required value of the ``size`` column.
    target : str
        Pattern the ``target`` column must contain.
    stage : str
        Required stage (compared in lower case); ``'ALL'`` disables the filter.
    query : str
        Free-text description passed to ``search``.
    var_metadata, var_fresh
        Not read by this function; they are part of the signature because the
        click handler passes them in.

    Returns
    -------
    tuple
        ``(table, table_with_vectors, True)``: at most the first 100 matches
        without the ``text_vector_`` column, the same rows with it, and
        ``True`` as the new "fresh results" flag.
    """
    df_knn = search(df, query)
    # Replace a raised amount of 0 with 'Undisclosed' again. assign() builds a
    # new frame instead of writing into the slice returned by search(), which
    # avoids pandas' SettingWithCopyWarning.
    df_knn = df_knn.assign(
        raised=df_knn['raised'].map(lambda amount: 'Undisclosed' if amount == 0 else amount)
    )

    df_size = filter_df(df_knn, 'size', '==', size, 1)

    if stage != 'ALL':
        df_stage = filter_df(df_size, 'stage', '==', stage.lower(), 1)
    else:
        #we bypass the filter
        df_stage = df_size

    df_target = filter_df(df_stage, 'target', 'contains', target, 1)

    top_hits = df_target.iloc[:100]
    return top_hits.drop(columns='text_vector_'), top_hits, True

def write_competitor_analysis(var_metadata, query, var_fresh):
    """Write a competitor analysis for the most recent search results.

    Parameters
    ----------
    var_metadata : pandas.DataFrame
        Search results (including ``text_vector_``) to analyse.
    query : str
        Query the results are scored against.
    var_fresh : bool
        Whether a search has been run since the last analysis.

    Returns
    -------
    tuple
        ``(text, False)``: the analysis or an explanatory message, plus
        ``False`` to reset the "fresh results" flag.
    """
    if not var_fresh:
        return 'Perform a new Startup Search first', False #we reset fresh state

    df_final = score_filter(var_metadata, query, 0.35)
    startups = df_final[['name', 'stage', 'description']][0:10].values.tolist()

    if len(startups) == 0:
        response = 'score too low to output valid results'
    else:
        # Scale the length of the analysis with the number of startups.
        # The ranges are contiguous: exactly 6 results used to match no branch
        # and crashed with an unbound `response`.
        if len(startups) <= 3:
            max_paragraphs = 1
        elif len(startups) <= 5:
            max_paragraphs = 2
        else:
            max_paragraphs = 3
        response = competitor_analysis_foo(startup_array=startups, max_paragraphs=max_paragraphs)

    return response, False #we reset fresh state

with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:
    gr.Markdown(
    """
    # Startup Search Engine
    """
    )
    # Hidden state passed between the two click handlers. gr.State is the
    # current name of the component formerly called gr.Variable.
    var_fresh = gr.State(value=False)
    var_metadata = gr.State(value=0)
    var_query = gr.State(value=0)
    # Radio buttons are single-choice by design; `multiselect` is not a Radio
    # parameter, so it is no longer passed.
    size = gr.Radio(['1-10', '11-50', '51-200', '201-500', '500+', '11-500+'], value='11-500+', label='size')
    target = gr.Radio(['B2B', 'B2C', 'B2G', 'B2B2C'], value='B2B', label='target')
    stage = gr.Radio(['pre-seed', 'A', 'B', 'C', 'ALL'], value='ALL', label='stage')
    # raised = gr.Slider(0, 20, value=5, step_size=1, label="Minimum raising (in Millions)")
    query = gr.Textbox(label='Describe the Startup you are searching for', value='age reversing')
    # competitor_analysis = gr.Radio(['write', 'do not write'], multiselect=False, value='do not write', label='write a competitor analysis')

    btn2 = gr.Button(value="Search for a Startup")
    btn1 = gr.Button(value="Write a competitor analysis")

    output1 = gr.Textbox(label='competitor analysis')
    output2 = gr.DataFrame(label='value')

    # The analysis reads the stored search results and clears the "fresh" flag.
    btn1.click(write_competitor_analysis, [var_metadata, query, var_fresh], [output1, var_fresh]) #competitor analysis
    # A search fills the table, stores the results and sets the "fresh" flag.
    btn2.click(vector_search, [size, target, stage, query, var_metadata, var_fresh], [output2, var_metadata, var_fresh]) #startup search

demo.launch(share=False)



Running on local URL:  http://127.0.0.1:7901

To create a public link, set `share=True` in `launch()`.




sending request
{"id":"cmpl-734OdouEI70awj0YgzW2v9fLQCusE","object":"text_completion","created":1680966239,"model":"text-davinci-003","choices":[{"text":"\n    Paragraph 1: \n    There appears to be a large focus on the pre-seed stage in this list of startups. AgeRate, Klogene Therapeutics, Inc., Modern Age, Age Labs, Assured Allies, Spring Discovery, AgeNation, and Elevian are all pre-seed startups that provide a variety of services related to aging. AgeRate and Age Labs are focused on providing accurate and affordable epigenetic tests to reveal a person’s biological age. Klogene Therapeutics, Inc. is focused on prevention and treatment of age-related diseases. Modern Age is a health and wellness platform designed to make the journey of aging more manageable. Assured Allies is a company focused on successful aging. Spring Discovery is focused on drug discovery for age-related diseases, and AgeNation is a digital media company for baby boomers and seniors. Lastly, Elevian is a molecula