{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Pre-Funding', 'Seed', 'A', 'Mature', 'C', 'Public', 'D',\n",
       "       'Pre-Seed', 'B', 'Debt Financing', 'F', 'Crowdfunding', 'E'],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Distinct funding-stage labels present in df1.\n",
    "# NOTE(review): df1 is created by the read_parquet('df_encoded.parquet') cell further down; run that cell first.\n",
    "df1['stage'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0., 3., 1., 4., 2., 5.])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Distinct values of df2's stage column.\n",
    "# NOTE(review): df2 is created by a later cell (read_parquet('df_encoded2.parquet')); the saved output lists\n",
    "# numeric codes, so it appears to predate the stage_dict relabelling -- confirm on a fresh run.\n",
    "df2['stage'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>description</th>\n",
       "      <th>year</th>\n",
       "      <th>target</th>\n",
       "      <th>size</th>\n",
       "      <th>stage</th>\n",
       "      <th>raised</th>\n",
       "      <th>tags</th>\n",
       "      <th>country</th>\n",
       "      <th>source</th>\n",
       "      <th>text_vector_</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.10 of a Second</td>\n",
       "      <td>Smart Indicators for Connected Vehicles</td>\n",
       "      <td>2019.0</td>\n",
       "      <td>B2B</td>\n",
       "      <td>1-10</td>\n",
       "      <td>Pre-Funding</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>[connected-vehicles, adas, autonomous-vehicles...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[-0.031224824488162994, -0.06342269480228424, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12trix</td>\n",
       "      <td>Math Lessons for Young Kids</td>\n",
       "      <td>2012.0</td>\n",
       "      <td>B2B, B2C</td>\n",
       "      <td>1-10</td>\n",
       "      <td>Pre-Funding</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>[sdg, schools, pre-k, serious-games, games, mo...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[-0.038649097084999084, 0.028091922402381897, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1E Therapeutics</td>\n",
       "      <td>Novel RNA-targeting Drugs</td>\n",
       "      <td>2021.0</td>\n",
       "      <td>B2B</td>\n",
       "      <td>51-200</td>\n",
       "      <td>Seed</td>\n",
       "      <td>$120M</td>\n",
       "      <td>[pharmaceuticals, chronic-disease, immunology,...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[0.04561534896492958, -0.017776092514395714, 0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1MRobotics</td>\n",
       "      <td>Retail Automation Solutions with Nano Fulfillment</td>\n",
       "      <td>2021.0</td>\n",
       "      <td>B2B</td>\n",
       "      <td>11-50</td>\n",
       "      <td>A</td>\n",
       "      <td>$25M</td>\n",
       "      <td>[omni-channel, ecommerce, climate-tech, artifi...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[0.0024080690927803516, -0.03042100928723812, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1touch.io</td>\n",
       "      <td>Personal Data Flow Tracking and Data Cataloging</td>\n",
       "      <td>2017.0</td>\n",
       "      <td>B2B</td>\n",
       "      <td>51-200</td>\n",
       "      <td>A</td>\n",
       "      <td>$16.1M</td>\n",
       "      <td>[enterprise-solutions, data-protection, cyber-...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[-0.01007091999053955, 0.10431888699531555, -0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4981</th>\n",
       "      <td>YOW HR</td>\n",
       "      <td>Human Resources Engagement and Optimization Pl...</td>\n",
       "      <td>2020.0</td>\n",
       "      <td>B2B, B2B2C</td>\n",
       "      <td>1-10</td>\n",
       "      <td>Pre-Funding</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>[content-creators, e-learning, software-applic...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[0.026961881667375565, 0.002459645736962557, -...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4982</th>\n",
       "      <td>Yummi Home Food</td>\n",
       "      <td>Marketplace for Homemade Food</td>\n",
       "      <td>2012.0</td>\n",
       "      <td>B2C</td>\n",
       "      <td>11-50</td>\n",
       "      <td>Pre-Funding</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>[ecommerce, p2p, delivery, online-shopping, ma...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[0.0036857957020401955, 0.03582162782549858, -...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4983</th>\n",
       "      <td>Yung-Etgar</td>\n",
       "      <td>Custom Mechanized Harvesting Systems</td>\n",
       "      <td>1982.0</td>\n",
       "      <td>B2B</td>\n",
       "      <td>51-200</td>\n",
       "      <td>Mature</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>[crops, agtech, harvesting, machinery, sdg, cl...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[0.027293115854263306, 0.010461761616170406, 0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4984</th>\n",
       "      <td>YuviTal</td>\n",
       "      <td>Digital Health and Fitness Solutions for Organ...</td>\n",
       "      <td>2017.0</td>\n",
       "      <td>B2B, B2C, B2G</td>\n",
       "      <td>11-50</td>\n",
       "      <td>Pre-Funding</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>[fitness, digital-wallet, discount, mobile-app...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[0.02851911261677742, 0.05474231392145157, -0....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4985</th>\n",
       "      <td>Z-square</td>\n",
       "      <td>Microendoscope for Minimally Invasive Imaging ...</td>\n",
       "      <td>2013.0</td>\n",
       "      <td>B2B</td>\n",
       "      <td>11-50</td>\n",
       "      <td>Seed</td>\n",
       "      <td>$10M</td>\n",
       "      <td>[endoscopy, medical-devices, minimally-invasiv...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[0.012587728910148144, -0.07959864288568497, -...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>4986 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  name                                        description  \\\n",
       "0     0.10 of a Second           Smart Indicators for Connected Vehicles    \n",
       "1               12trix                        Math Lessons for Young Kids   \n",
       "2      1E Therapeutics                          Novel RNA-targeting Drugs   \n",
       "3           1MRobotics  Retail Automation Solutions with Nano Fulfillment   \n",
       "4            1touch.io    Personal Data Flow Tracking and Data Cataloging   \n",
       "...                ...                                                ...   \n",
       "4981            YOW HR  Human Resources Engagement and Optimization Pl...   \n",
       "4982   Yummi Home Food                      Marketplace for Homemade Food   \n",
       "4983        Yung-Etgar               Custom Mechanized Harvesting Systems   \n",
       "4984           YuviTal  Digital Health and Fitness Solutions for Organ...   \n",
       "4985          Z-square  Microendoscope for Minimally Invasive Imaging ...   \n",
       "\n",
       "        year         target    size        stage       raised  \\\n",
       "0     2019.0            B2B    1-10  Pre-Funding  Undisclosed   \n",
       "1     2012.0       B2B, B2C    1-10  Pre-Funding  Undisclosed   \n",
       "2     2021.0            B2B  51-200         Seed        $120M   \n",
       "3     2021.0            B2B   11-50            A         $25M   \n",
       "4     2017.0            B2B  51-200            A       $16.1M   \n",
       "...      ...            ...     ...          ...          ...   \n",
       "4981  2020.0     B2B, B2B2C    1-10  Pre-Funding  Undisclosed   \n",
       "4982  2012.0            B2C   11-50  Pre-Funding  Undisclosed   \n",
       "4983  1982.0            B2B  51-200       Mature  Undisclosed   \n",
       "4984  2017.0  B2B, B2C, B2G   11-50  Pre-Funding  Undisclosed   \n",
       "4985  2013.0            B2B   11-50         Seed         $10M   \n",
       "\n",
       "                                                   tags country  \\\n",
       "0     [connected-vehicles, adas, autonomous-vehicles...  Israel   \n",
       "1     [sdg, schools, pre-k, serious-games, games, mo...  Israel   \n",
       "2     [pharmaceuticals, chronic-disease, immunology,...  Israel   \n",
       "3     [omni-channel, ecommerce, climate-tech, artifi...  Israel   \n",
       "4     [enterprise-solutions, data-protection, cyber-...  Israel   \n",
       "...                                                 ...     ...   \n",
       "4981  [content-creators, e-learning, software-applic...  Israel   \n",
       "4982  [ecommerce, p2p, delivery, online-shopping, ma...  Israel   \n",
       "4983  [crops, agtech, harvesting, machinery, sdg, cl...  Israel   \n",
       "4984  [fitness, digital-wallet, discount, mobile-app...  Israel   \n",
       "4985  [endoscopy, medical-devices, minimally-invasiv...  Israel   \n",
       "\n",
       "                                        source  \\\n",
       "0     https://finder.startupnationcentral.org/   \n",
       "1     https://finder.startupnationcentral.org/   \n",
       "2     https://finder.startupnationcentral.org/   \n",
       "3     https://finder.startupnationcentral.org/   \n",
       "4     https://finder.startupnationcentral.org/   \n",
       "...                                        ...   \n",
       "4981  https://finder.startupnationcentral.org/   \n",
       "4982  https://finder.startupnationcentral.org/   \n",
       "4983  https://finder.startupnationcentral.org/   \n",
       "4984  https://finder.startupnationcentral.org/   \n",
       "4985  https://finder.startupnationcentral.org/   \n",
       "\n",
       "                                           text_vector_  \n",
       "0     [-0.031224824488162994, -0.06342269480228424, ...  \n",
       "1     [-0.038649097084999084, 0.028091922402381897, ...  \n",
       "2     [0.04561534896492958, -0.017776092514395714, 0...  \n",
       "3     [0.0024080690927803516, -0.03042100928723812, ...  \n",
       "4     [-0.01007091999053955, 0.10431888699531555, -0...  \n",
       "...                                                 ...  \n",
       "4981  [0.026961881667375565, 0.002459645736962557, -...  \n",
       "4982  [0.0036857957020401955, 0.03582162782549858, -...  \n",
       "4983  [0.027293115854263306, 0.010461761616170406, 0...  \n",
       "4984  [0.02851911261677742, 0.05474231392145157, -0....  \n",
       "4985  [0.012587728910148144, -0.07959864288568497, -...  \n",
       "\n",
       "[4986 rows x 11 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Read 'df_encoded.parquet' into df1 and show the frame as the cell result.\n",
    "df1 = pd.read_parquet(path='df_encoded.parquet')\n",
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>description</th>\n",
       "      <th>stage</th>\n",
       "      <th>tags</th>\n",
       "      <th>url</th>\n",
       "      <th>country</th>\n",
       "      <th>text_vector_</th>\n",
       "      <th>raised</th>\n",
       "      <th>size</th>\n",
       "      <th>source</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Digipal</td>\n",
       "      <td>Digipal is a digital consultancy based in Tbil...</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>[software, data]</td>\n",
       "      <td>https://www.digipal.agency/</td>\n",
       "      <td>georgia</td>\n",
       "      <td>[0.017287444323301315, 0.06208805367350578, -0...</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BeatBind</td>\n",
       "      <td>BeatBind is the industry's long overdue platfo...</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>[social, leisure]</td>\n",
       "      <td>https://beatbind.io/</td>\n",
       "      <td>georgia</td>\n",
       "      <td>[-0.00438214186578989, -0.051213208585977554, ...</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Smart Academy</td>\n",
       "      <td>Smart Academy is a modern educational institut...</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>[edtech]</td>\n",
       "      <td>https://smartacademy.ge/</td>\n",
       "      <td>georgia</td>\n",
       "      <td>[0.0005468669114634395, -0.05331585183739662, ...</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>MaxinAI</td>\n",
       "      <td>MaxinAI isglobal AI development company that w...</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>[software, data]</td>\n",
       "      <td>https://www.maxinai.com/#all-industries</td>\n",
       "      <td>georgia</td>\n",
       "      <td>[0.021948501467704773, 0.024166792631149292, -...</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TLANCER</td>\n",
       "      <td>Tlancer aims to create an unlimited educationa...</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>[edtech]</td>\n",
       "      <td>https://www.tlancer.ge/</td>\n",
       "      <td>georgia</td>\n",
       "      <td>[0.02025573141872883, -0.022812215611338615, -...</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94521</th>\n",
       "      <td>OneTwo</td>\n",
       "      <td>klkdčksč kdč skdčlsk čdksčd ksčk dčskdčk čdk</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>[software, data]</td>\n",
       "      <td>www.nethr</td>\n",
       "      <td>croatia</td>\n",
       "      <td>[0.07235302031040192, -0.05674564838409424, -0...</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94522</th>\n",
       "      <td>Trialfire</td>\n",
       "      <td>Engaged trialers turn into customers, engaged ...</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>[software, data]</td>\n",
       "      <td>http://www.trialfire.com</td>\n",
       "      <td>canada</td>\n",
       "      <td>[0.030764097347855568, 0.054082825779914856, -...</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94523</th>\n",
       "      <td>ILLUMAGEAR</td>\n",
       "      <td>ILLUMAGEAR’s mission is to illuminate people a...</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>[software, data]</td>\n",
       "      <td>http://www.illumagear.com</td>\n",
       "      <td>united-states</td>\n",
       "      <td>[0.015447210520505905, -0.0984775498509407, 0....</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94524</th>\n",
       "      <td>Knowillage</td>\n",
       "      <td>Knowillage lets you add personalization to you...</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>[edtech]</td>\n",
       "      <td>http://www.knowillage.com</td>\n",
       "      <td>canada</td>\n",
       "      <td>[0.007970919832587242, -0.04347420111298561, -...</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94525</th>\n",
       "      <td>Iris Holidays</td>\n",
       "      <td>Iris Holidays is a full service Kerala tours o...</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>[software, data]</td>\n",
       "      <td>http://www.irisholidays.com</td>\n",
       "      <td>india</td>\n",
       "      <td>[0.0032976483926177025, -0.010843133553862572,...</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>94526 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 name                                        description  \\\n",
       "0            Digipal   Digipal is a digital consultancy based in Tbil...   \n",
       "1            BeatBind  BeatBind is the industry's long overdue platfo...   \n",
       "2       Smart Academy  Smart Academy is a modern educational institut...   \n",
       "3             MaxinAI  MaxinAI isglobal AI development company that w...   \n",
       "4             TLANCER  Tlancer aims to create an unlimited educationa...   \n",
       "...               ...                                                ...   \n",
       "94521          OneTwo       klkdčksč kdč skdčlsk čdksčd ksčk dčskdčk čdk   \n",
       "94522       Trialfire  Engaged trialers turn into customers, engaged ...   \n",
       "94523      ILLUMAGEAR  ILLUMAGEAR’s mission is to illuminate people a...   \n",
       "94524      Knowillage  Knowillage lets you add personalization to you...   \n",
       "94525  Iris Holidays   Iris Holidays is a full service Kerala tours o...   \n",
       "\n",
       "          stage               tags                                      url  \\\n",
       "0      pre-seed   [software, data]              https://www.digipal.agency/   \n",
       "1      pre-seed  [social, leisure]                     https://beatbind.io/   \n",
       "2      pre-seed           [edtech]                 https://smartacademy.ge/   \n",
       "3      pre-seed   [software, data]  https://www.maxinai.com/#all-industries   \n",
       "4      pre-seed           [edtech]                  https://www.tlancer.ge/   \n",
       "...         ...                ...                                      ...   \n",
       "94521  pre-seed   [software, data]                                www.nethr   \n",
       "94522  pre-seed   [software, data]                 http://www.trialfire.com   \n",
       "94523  pre-seed   [software, data]                http://www.illumagear.com   \n",
       "94524  pre-seed           [edtech]                http://www.knowillage.com   \n",
       "94525  pre-seed   [software, data]              http://www.irisholidays.com   \n",
       "\n",
       "             country                                       text_vector_  \\\n",
       "0            georgia  [0.017287444323301315, 0.06208805367350578, -0...   \n",
       "1            georgia  [-0.00438214186578989, -0.051213208585977554, ...   \n",
       "2            georgia  [0.0005468669114634395, -0.05331585183739662, ...   \n",
       "3            georgia  [0.021948501467704773, 0.024166792631149292, -...   \n",
       "4            georgia  [0.02025573141872883, -0.022812215611338615, -...   \n",
       "...              ...                                                ...   \n",
       "94521        croatia  [0.07235302031040192, -0.05674564838409424, -0...   \n",
       "94522         canada  [0.030764097347855568, 0.054082825779914856, -...   \n",
       "94523  united-states  [0.015447210520505905, -0.0984775498509407, 0....   \n",
       "94524         canada  [0.007970919832587242, -0.04347420111298561, -...   \n",
       "94525          india  [0.0032976483926177025, -0.010843133553862572,...   \n",
       "\n",
       "            raised     size                        source       target  \n",
       "0      Undisclosed  11-500+  https://www.startupblink.com  Undisclosed  \n",
       "1      Undisclosed  11-500+  https://www.startupblink.com  Undisclosed  \n",
       "2      Undisclosed  11-500+  https://www.startupblink.com  Undisclosed  \n",
       "3      Undisclosed  11-500+  https://www.startupblink.com  Undisclosed  \n",
       "4      Undisclosed  11-500+  https://www.startupblink.com  Undisclosed  \n",
       "...            ...      ...                           ...          ...  \n",
       "94521  Undisclosed  11-500+  https://www.startupblink.com  Undisclosed  \n",
       "94522  Undisclosed  11-500+  https://www.startupblink.com  Undisclosed  \n",
       "94523  Undisclosed  11-500+  https://www.startupblink.com  Undisclosed  \n",
       "94524  Undisclosed  11-500+  https://www.startupblink.com  Undisclosed  \n",
       "94525  Undisclosed  11-500+  https://www.startupblink.com  Undisclosed  \n",
       "\n",
       "[94526 rows x 11 columns]"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Numeric stage code -> label for the rows read from 'df_encoded2.parquet'.\n",
    "# NOTE(review): these labels are lower-case ('pre-seed', 'seed') while df1.stage holds 'Pre-Seed' / 'Seed';\n",
    "# confirm whether the combined table should use a single casing.\n",
    "stage_dict = {\n",
    "    0 : \"pre-seed\",\n",
    "    1 : \"seed\",\n",
    "    2 : \"A\",\n",
    "    3 : \"B\",\n",
    "    4 : \"C\",\n",
    "    5 : \"Exit\",\n",
    "}\n",
    "\n",
    "# Build df2 in one chain so no intermediate frame is mutated in place, and rename columns\n",
    "# by explicit mapping so the result does not depend on column position.\n",
    "df2 = (\n",
    "    pd.read_parquet('df_encoded2.parquet')\n",
    "    .loc[:, ['title', 'description', 'stage', 'industry_name', 'url', 'country_slug', 'text_vector_']]\n",
    "    # Align column names with the ones df1 uses (name / tags / country).\n",
    "    .rename(columns={'title': 'name', 'industry_name': 'tags', 'country_slug': 'country'})\n",
    "    .assign(\n",
    "        # Dict lookup per value: a code missing from stage_dict raises KeyError instead of passing through.\n",
    "        stage=lambda d: d['stage'].apply(lambda code: stage_dict[code]),\n",
    "        # 'A & B' -> ['a', 'b']: split on '&', trim whitespace, lower-case each part.\n",
    "        tags=lambda d: d['tags'].apply(lambda x: [a.strip().lower() for a in x.split('&')]),\n",
    "        # Constant placeholder columns, appended in this order.\n",
    "        raised='Undisclosed',\n",
    "        size='11-500+',\n",
    "        source='https://www.startupblink.com',\n",
    "        target='Undisclosed',\n",
    "    )\n",
    ")\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack df1 and df2 row-wise with a fresh index, discard 'year', replace the remaining\n",
    "# missing values with '' and write the combined table to 'df_encoded3.parquet'.\n",
    "df3 = (\n",
    "    pd.concat([df1, df2], ignore_index=True, axis=0)\n",
    "    .drop(columns=['year'])\n",
    "    .fillna('')\n",
    ")\n",
    "df3.to_parquet('df_encoded3.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>description</th>\n",
       "      <th>target</th>\n",
       "      <th>size</th>\n",
       "      <th>stage</th>\n",
       "      <th>raised</th>\n",
       "      <th>tags</th>\n",
       "      <th>country</th>\n",
       "      <th>source</th>\n",
       "      <th>text_vector_</th>\n",
       "      <th>url</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.10 of a Second</td>\n",
       "      <td>Smart Indicators for Connected Vehicles</td>\n",
       "      <td>B2B</td>\n",
       "      <td>1-10</td>\n",
       "      <td>Pre-Funding</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>[connected-vehicles, adas, autonomous-vehicles...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[-0.031224824488162994, -0.06342269480228424, ...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12trix</td>\n",
       "      <td>Math Lessons for Young Kids</td>\n",
       "      <td>B2B, B2C</td>\n",
       "      <td>1-10</td>\n",
       "      <td>Pre-Funding</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>[sdg, schools, pre-k, serious-games, games, mo...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[-0.038649097084999084, 0.028091922402381897, ...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1E Therapeutics</td>\n",
       "      <td>Novel RNA-targeting Drugs</td>\n",
       "      <td>B2B</td>\n",
       "      <td>51-200</td>\n",
       "      <td>Seed</td>\n",
       "      <td>$120M</td>\n",
       "      <td>[pharmaceuticals, chronic-disease, immunology,...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[0.04561534896492958, -0.017776092514395714, 0...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1MRobotics</td>\n",
       "      <td>Retail Automation Solutions with Nano Fulfillment</td>\n",
       "      <td>B2B</td>\n",
       "      <td>11-50</td>\n",
       "      <td>A</td>\n",
       "      <td>$25M</td>\n",
       "      <td>[omni-channel, ecommerce, climate-tech, artifi...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[0.0024080690927803516, -0.03042100928723812, ...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1touch.io</td>\n",
       "      <td>Personal Data Flow Tracking and Data Cataloging</td>\n",
       "      <td>B2B</td>\n",
       "      <td>51-200</td>\n",
       "      <td>A</td>\n",
       "      <td>$16.1M</td>\n",
       "      <td>[enterprise-solutions, data-protection, cyber-...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[-0.01007091999053955, 0.10431888699531555, -0...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>24me</td>\n",
       "      <td>Next-generation Personal Assistant</td>\n",
       "      <td>B2C</td>\n",
       "      <td>1-10</td>\n",
       "      <td>Seed</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>[time-management, scheduling, calendars, artif...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[0.035849399864673615, 0.04990792274475098, -0...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>270Surgical</td>\n",
       "      <td>Specialty Laparoscopic System for Wide Cavity ...</td>\n",
       "      <td>B2B, B2C</td>\n",
       "      <td>11-50</td>\n",
       "      <td>Pre-Funding</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>[endoscopy, surgery, operating-rooms, optics, ...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[-0.00110541470348835, 0.011574415490031242, 0...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2bPrecise</td>\n",
       "      <td>Precision Medicine Solution</td>\n",
       "      <td>B2B</td>\n",
       "      <td>51-200</td>\n",
       "      <td>Mature</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>[decision-making, predictive-analytics, cardio...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[0.01863308809697628, 0.03877090662717819, -0....</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2breathe Technologies</td>\n",
       "      <td>Smart Device and Mobile App to Induce Sleep</td>\n",
       "      <td>B2C</td>\n",
       "      <td>1-10</td>\n",
       "      <td>Pre-Funding</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>[monitoring, digital-healthcare, sleep-disorde...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[-0.03323083370923996, -0.006272533442825079, ...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2TeaM</td>\n",
       "      <td>Software Solutions for Financial Companies</td>\n",
       "      <td>B2B, B2B2C</td>\n",
       "      <td>11-50</td>\n",
       "      <td>Mature</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>[marketing, insurance-companies, bank-infrastr...</td>\n",
       "      <td>Israel</td>\n",
       "      <td>https://finder.startupnationcentral.org/</td>\n",
       "      <td>[-0.0050485446117818356, 0.030337687581777573,...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Digipal</td>\n",
       "      <td>Digipal is a digital consultancy based in Tbil...</td>\n",
       "      <td>georgia</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>[software, data]</td>\n",
       "      <td>[0.017287444323301315, 0.06208805367350578, -0...</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>https://www.digipal.agency/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>BeatBind</td>\n",
       "      <td>BeatBind is the industry's long overdue platfo...</td>\n",
       "      <td>georgia</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>[social, leisure]</td>\n",
       "      <td>[-0.00438214186578989, -0.051213208585977554, ...</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>https://beatbind.io/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Smart Academy</td>\n",
       "      <td>Smart Academy is a modern educational institut...</td>\n",
       "      <td>georgia</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>[edtech]</td>\n",
       "      <td>[0.0005468669114634395, -0.05331585183739662, ...</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>https://smartacademy.ge/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>MaxinAI</td>\n",
       "      <td>MaxinAI isglobal AI development company that w...</td>\n",
       "      <td>georgia</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>[software, data]</td>\n",
       "      <td>[0.021948501467704773, 0.024166792631149292, -...</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>https://www.maxinai.com/#all-industries</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>TLANCER</td>\n",
       "      <td>Tlancer aims to create an unlimited educationa...</td>\n",
       "      <td>georgia</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>[edtech]</td>\n",
       "      <td>[0.02025573141872883, -0.022812215611338615, -...</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>https://www.tlancer.ge/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>MyCoins.ge</td>\n",
       "      <td>MyCoins.ge is the biggest Crypto exchange plat...</td>\n",
       "      <td>georgia</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>[fintech]</td>\n",
       "      <td>[0.0306679829955101, -0.010290002450346947, -0...</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>https://www.mycoins.ge/index.php/main/welcome</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>ATL Tech</td>\n",
       "      <td>ATL Tech is a company that specialized in Info...</td>\n",
       "      <td>azerbaijan</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>[software, data]</td>\n",
       "      <td>[0.014148630201816559, -0.01890609972178936, -...</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>https://www.atltech.az/az</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>zypl.ai</td>\n",
       "      <td>zypl.ai’s strategy is to become the leading AI...</td>\n",
       "      <td>tajikistan</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>[software, data]</td>\n",
       "      <td>[0.001473484211601317, 0.008834785781800747, -...</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>https://zypl.ai/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>botifi</td>\n",
       "      <td>botifi is a tool for a quick start of sales on...</td>\n",
       "      <td>uzbekistan</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>[software, data]</td>\n",
       "      <td>[0.017161941155791283, -0.015285761095583439, ...</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>https://botifi.me/en/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>smartup</td>\n",
       "      <td>smartup develop software solutions for various...</td>\n",
       "      <td>uzbekistan</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>pre-seed</td>\n",
       "      <td>11-500+</td>\n",
       "      <td>[software, data]</td>\n",
       "      <td>[0.00023191649233922362, -0.005923444870859384...</td>\n",
       "      <td>https://www.startupblink.com</td>\n",
       "      <td>Undisclosed</td>\n",
       "      <td>https://smartup.uz/en.html</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     name                                        description  \\\n",
       "0        0.10 of a Second           Smart Indicators for Connected Vehicles    \n",
       "1                  12trix                        Math Lessons for Young Kids   \n",
       "2         1E Therapeutics                          Novel RNA-targeting Drugs   \n",
       "3              1MRobotics  Retail Automation Solutions with Nano Fulfillment   \n",
       "4               1touch.io    Personal Data Flow Tracking and Data Cataloging   \n",
       "5                    24me                 Next-generation Personal Assistant   \n",
       "6             270Surgical  Specialty Laparoscopic System for Wide Cavity ...   \n",
       "7               2bPrecise                        Precision Medicine Solution   \n",
       "8   2breathe Technologies        Smart Device and Mobile App to Induce Sleep   \n",
       "9                   2TeaM         Software Solutions for Financial Companies   \n",
       "10               Digipal   Digipal is a digital consultancy based in Tbil...   \n",
       "11               BeatBind  BeatBind is the industry's long overdue platfo...   \n",
       "12          Smart Academy  Smart Academy is a modern educational institut...   \n",
       "13                MaxinAI  MaxinAI isglobal AI development company that w...   \n",
       "14                TLANCER  Tlancer aims to create an unlimited educationa...   \n",
       "15             MyCoins.ge  MyCoins.ge is the biggest Crypto exchange plat...   \n",
       "16               ATL Tech  ATL Tech is a company that specialized in Info...   \n",
       "17                zypl.ai  zypl.ai’s strategy is to become the leading AI...   \n",
       "18                 botifi  botifi is a tool for a quick start of sales on...   \n",
       "19                smartup  smartup develop software solutions for various...   \n",
       "\n",
       "        target         size        stage       raised  \\\n",
       "0          B2B         1-10  Pre-Funding  Undisclosed   \n",
       "1     B2B, B2C         1-10  Pre-Funding  Undisclosed   \n",
       "2          B2B       51-200         Seed        $120M   \n",
       "3          B2B        11-50            A         $25M   \n",
       "4          B2B       51-200            A       $16.1M   \n",
       "5          B2C         1-10         Seed  Undisclosed   \n",
       "6     B2B, B2C        11-50  Pre-Funding  Undisclosed   \n",
       "7          B2B       51-200       Mature  Undisclosed   \n",
       "8          B2C         1-10  Pre-Funding  Undisclosed   \n",
       "9   B2B, B2B2C        11-50       Mature  Undisclosed   \n",
       "10     georgia  Undisclosed     pre-seed      11-500+   \n",
       "11     georgia  Undisclosed     pre-seed      11-500+   \n",
       "12     georgia  Undisclosed     pre-seed      11-500+   \n",
       "13     georgia  Undisclosed     pre-seed      11-500+   \n",
       "14     georgia  Undisclosed     pre-seed      11-500+   \n",
       "15     georgia  Undisclosed     pre-seed      11-500+   \n",
       "16  azerbaijan  Undisclosed     pre-seed      11-500+   \n",
       "17  tajikistan  Undisclosed     pre-seed      11-500+   \n",
       "18  uzbekistan  Undisclosed     pre-seed      11-500+   \n",
       "19  uzbekistan  Undisclosed     pre-seed      11-500+   \n",
       "\n",
       "                                                 tags  \\\n",
       "0   [connected-vehicles, adas, autonomous-vehicles...   \n",
       "1   [sdg, schools, pre-k, serious-games, games, mo...   \n",
       "2   [pharmaceuticals, chronic-disease, immunology,...   \n",
       "3   [omni-channel, ecommerce, climate-tech, artifi...   \n",
       "4   [enterprise-solutions, data-protection, cyber-...   \n",
       "5   [time-management, scheduling, calendars, artif...   \n",
       "6   [endoscopy, surgery, operating-rooms, optics, ...   \n",
       "7   [decision-making, predictive-analytics, cardio...   \n",
       "8   [monitoring, digital-healthcare, sleep-disorde...   \n",
       "9   [marketing, insurance-companies, bank-infrastr...   \n",
       "10                                   [software, data]   \n",
       "11                                  [social, leisure]   \n",
       "12                                           [edtech]   \n",
       "13                                   [software, data]   \n",
       "14                                           [edtech]   \n",
       "15                                          [fintech]   \n",
       "16                                   [software, data]   \n",
       "17                                   [software, data]   \n",
       "18                                   [software, data]   \n",
       "19                                   [software, data]   \n",
       "\n",
       "                                              country  \\\n",
       "0                                              Israel   \n",
       "1                                              Israel   \n",
       "2                                              Israel   \n",
       "3                                              Israel   \n",
       "4                                              Israel   \n",
       "5                                              Israel   \n",
       "6                                              Israel   \n",
       "7                                              Israel   \n",
       "8                                              Israel   \n",
       "9                                              Israel   \n",
       "10  [0.017287444323301315, 0.06208805367350578, -0...   \n",
       "11  [-0.00438214186578989, -0.051213208585977554, ...   \n",
       "12  [0.0005468669114634395, -0.05331585183739662, ...   \n",
       "13  [0.021948501467704773, 0.024166792631149292, -...   \n",
       "14  [0.02025573141872883, -0.022812215611338615, -...   \n",
       "15  [0.0306679829955101, -0.010290002450346947, -0...   \n",
       "16  [0.014148630201816559, -0.01890609972178936, -...   \n",
       "17  [0.001473484211601317, 0.008834785781800747, -...   \n",
       "18  [0.017161941155791283, -0.015285761095583439, ...   \n",
       "19  [0.00023191649233922362, -0.005923444870859384...   \n",
       "\n",
       "                                      source  \\\n",
       "0   https://finder.startupnationcentral.org/   \n",
       "1   https://finder.startupnationcentral.org/   \n",
       "2   https://finder.startupnationcentral.org/   \n",
       "3   https://finder.startupnationcentral.org/   \n",
       "4   https://finder.startupnationcentral.org/   \n",
       "5   https://finder.startupnationcentral.org/   \n",
       "6   https://finder.startupnationcentral.org/   \n",
       "7   https://finder.startupnationcentral.org/   \n",
       "8   https://finder.startupnationcentral.org/   \n",
       "9   https://finder.startupnationcentral.org/   \n",
       "10              https://www.startupblink.com   \n",
       "11              https://www.startupblink.com   \n",
       "12              https://www.startupblink.com   \n",
       "13              https://www.startupblink.com   \n",
       "14              https://www.startupblink.com   \n",
       "15              https://www.startupblink.com   \n",
       "16              https://www.startupblink.com   \n",
       "17              https://www.startupblink.com   \n",
       "18              https://www.startupblink.com   \n",
       "19              https://www.startupblink.com   \n",
       "\n",
       "                                         text_vector_  \\\n",
       "0   [-0.031224824488162994, -0.06342269480228424, ...   \n",
       "1   [-0.038649097084999084, 0.028091922402381897, ...   \n",
       "2   [0.04561534896492958, -0.017776092514395714, 0...   \n",
       "3   [0.0024080690927803516, -0.03042100928723812, ...   \n",
       "4   [-0.01007091999053955, 0.10431888699531555, -0...   \n",
       "5   [0.035849399864673615, 0.04990792274475098, -0...   \n",
       "6   [-0.00110541470348835, 0.011574415490031242, 0...   \n",
       "7   [0.01863308809697628, 0.03877090662717819, -0....   \n",
       "8   [-0.03323083370923996, -0.006272533442825079, ...   \n",
       "9   [-0.0050485446117818356, 0.030337687581777573,...   \n",
       "10                                        Undisclosed   \n",
       "11                                        Undisclosed   \n",
       "12                                        Undisclosed   \n",
       "13                                        Undisclosed   \n",
       "14                                        Undisclosed   \n",
       "15                                        Undisclosed   \n",
       "16                                        Undisclosed   \n",
       "17                                        Undisclosed   \n",
       "18                                        Undisclosed   \n",
       "19                                        Undisclosed   \n",
       "\n",
       "                                              url  \n",
       "0                                                  \n",
       "1                                                  \n",
       "2                                                  \n",
       "3                                                  \n",
       "4                                                  \n",
       "5                                                  \n",
       "6                                                  \n",
       "7                                                  \n",
       "8                                                  \n",
       "9                                                  \n",
       "10                    https://www.digipal.agency/  \n",
       "11                           https://beatbind.io/  \n",
       "12                       https://smartacademy.ge/  \n",
       "13        https://www.maxinai.com/#all-industries  \n",
       "14                        https://www.tlancer.ge/  \n",
       "15  https://www.mycoins.ge/index.php/main/welcome  \n",
       "16                      https://www.atltech.az/az  \n",
       "17                               https://zypl.ai/  \n",
       "18                          https://botifi.me/en/  \n",
       "19                     https://smartup.uz/en.html  "
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.merge(df1, df2, on='A', how='outer')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.system('pip install openpyxl')\n",
    "os.system('pip install sentence-transformers')\n",
    "import pandas as pd\n",
    "import gradio as gr\n",
    "from sentence_transformers import SentenceTransformer\n",
    "\n",
    "model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2\n",
    "\n",
    "df = pd.read_parquet('df_encoded.parquet')\n",
    "df['tags'] = df['tags'].apply(lambda x : str(x))\n",
    "def parse_raised(x):\n",
    "    if x == 'Undisclosed':\n",
    "        return 0\n",
    "    else: \n",
    "        quantifier = x[-1]\n",
    "        x = float(x[1:-1])\n",
    "        if quantifier == 'K':\n",
    "            return x/1000\n",
    "        elif quantifier == 'M':\n",
    "            return x\n",
    "df['raised'] = df['raised'].apply(lambda x : parse_raised(x))\n",
    "df = df.reset_index(drop=True)\n",
    "\n",
    "from sklearn.neighbors import NearestNeighbors\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sentence_transformers import SentenceTransformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\ardit\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\gradio\\deprecation.py:43: UserWarning: You have unused kwarg parameters in Radio, please remove them: {'multiselect': False}\n",
      "  warnings.warn(\n",
      "c:\\Users\\ardit\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\gradio\\deprecation.py:43: UserWarning: You have unused kwarg parameters in Slider, please remove them: {'step_size': 1}\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running on local URL:  http://127.0.0.1:7896\n",
      "\n",
      "To create a public link, set `share=True` in `launch()`.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"http://127.0.0.1:7896/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": []
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "a\n"
     ]
    }
   ],
   "source": [
    "def filter_df(df, column_name, filter_type, filter_value):\n",
    "    if filter_type == '==':\n",
    "        df_filtered = df[df[column_name]==filter_value]\n",
    "    elif filter_type == '>=':\n",
    "        df_filtered = df[df[column_name]>=filter_value]\n",
    "    elif filter_type == '<=':\n",
    "        df_filtered = df[df[column_name]<=filter_value]\n",
    "    elif filter_type == 'contains':\n",
    "        df_filtered = df[df['target'].str.contains(filter_value)]\n",
    "    return df_filtered\n",
    "\n",
    "def search(df, query):\n",
    "    product = model.encode(query).tolist()\n",
    "    # product = df.iloc[0]['text_vector_'] #use one of the products as sample\n",
    "\n",
    "    #prepare model\n",
    "    nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())\n",
    "\n",
    "    distances, indices = nbrs.kneighbors([product]) #input the vector of the reference object\n",
    "\n",
    "    #print out the description of every recommended product\n",
    "    return df.iloc[list(indices)[0]][['name', 'description', 'raised', 'year', 'target', 'size', 'stage', 'tags']]\n",
    "\n",
    "#the first module becomes text1, the second module file1\n",
    "def greet(size, target, raised, query): \n",
    "    df_size = filter_df(df, 'size', '==', size)\n",
    "    df_target = filter_df(df_size, 'target', 'contains', target)\n",
    "    def raised_zero(x):\n",
    "        if x == 0:\n",
    "            return 'Undisclosed'\n",
    "        else:\n",
    "            return x\n",
    "    print('a')\n",
    "    df_raised = df_target[(df_target['raised'] >= raised) | (df_target['raised'] == 0)]\n",
    "    df_knn = search(df_raised, query)\n",
    "    #we live the sorting for last\n",
    "    df_knn = df_knn.sort_values('raised', ascending=False)\n",
    "    df_knn['raised'] = df_knn['raised'].apply(lambda x : raised_zero(x))\n",
    "\n",
    "    return df_knn\n",
    "\n",
    "with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:\n",
    "    gr.Markdown(\n",
    "    \"\"\"\n",
    "    # Gradio with History\n",
    "    \"\"\"\n",
    "    )\n",
    "    size = gr.Radio(['1-10', '11-50', '51-200', '201-500', '500+'], multiselect=False, value='11-50', label='size')\n",
    "    target = gr.Radio(['B2B', 'B2C', 'B2G', 'B2B2C'], value='B2B', multiselect=False, label='target')\n",
    "    raised = gr.Slider(0, 20, value=5, step_size=1, label=\"Minimum raising (in Millions)\")\n",
    "    query = gr.Textbox(label='Describe the Startup you are searching for', value='age reversing')\n",
    "    btn = gr.Button(value=\"Search for a Startup\")\n",
    "    output1 = gr.DataFrame(label='value')\n",
    "    # btn.click(greet, inputs='text', outputs=['dataframe'])\n",
    "    btn.click(greet, [size, target, raised, query], [output1])\n",
    "demo.launch(share=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}