{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "ename": "ImportError",
     "evalue": "cannot import name 'data_path' from 'utils' (/Users/yonglinwu/dev/image-search-playground/utils.py)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[28], line 9\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mtorch\u001b[39;00m\n\u001b[1;32m      7\u001b[0m torch\u001b[39m.\u001b[39mset_printoptions(precision\u001b[39m=\u001b[39m\u001b[39m10\u001b[39m)\n\u001b[0;32m----> 9\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mutils\u001b[39;00m \u001b[39mimport\u001b[39;00m get_image_embeddings, model_name_to_ids, load_models, model_dict, data_path\n\u001b[1;32m     11\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mwarnings\u001b[39;00m\n\u001b[1;32m     12\u001b[0m warnings\u001b[39m.\u001b[39msimplefilter(action\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mignore\u001b[39m\u001b[39m'\u001b[39m, category\u001b[39m=\u001b[39m\u001b[39mFutureWarning\u001b[39;00m)\n",
      "\u001b[0;31mImportError\u001b[0m: cannot import name 'data_path' from 'utils' (/Users/yonglinwu/dev/image-search-playground/utils.py)"
     ]
    }
   ],
   "source": [
    "from sentence_transformers import SentenceTransformer, util\n",
    "from PIL import Image\n",
    "import pandas as pd\n",
    "import os\n",
    "import numpy as np\n",
    "import torch\n",
    "torch.set_printoptions(precision=10)\n",
    "\n",
    "from utils import get_image_embeddings, model_name_to_ids, load_models, model_dict, data_path\n",
    "\n",
    "import warnings\n",
    "warnings.simplefilter(action='ignore', category=FutureWarning)\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Build the path with os.path.join so it is correct whether or not data_path ends in a separator.\n",
    "patagonia_df = pd.read_csv(os.path.join(data_path, 'metadata/patagonia_losGatos.tsv'), sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>brand</th>\n",
       "      <th>title</th>\n",
       "      <th>product_url</th>\n",
       "      <th>price</th>\n",
       "      <th>description</th>\n",
       "      <th>size</th>\n",
       "      <th>category</th>\n",
       "      <th>colors</th>\n",
       "      <th>Poshmark</th>\n",
       "      <th>Unnamed: 9</th>\n",
       "      <th>...</th>\n",
       "      <th>Unnamed: 38</th>\n",
       "      <th>Unnamed: 39</th>\n",
       "      <th>Unnamed: 40</th>\n",
       "      <th>Unnamed: 41</th>\n",
       "      <th>Unnamed: 42</th>\n",
       "      <th>Unnamed: 43</th>\n",
       "      <th>Unnamed: 44</th>\n",
       "      <th>Unnamed: 45</th>\n",
       "      <th>Unnamed: 46</th>\n",
       "      <th>Unnamed: 47</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Patagonia</td>\n",
       "      <td>Patagonia Women's Los Gatos Fleece 1/4-Zip Smo...</td>\n",
       "      <td>https://poshmark.com/listing/63d4821f2fbf1afe8...</td>\n",
       "      <td>$36.00</td>\n",
       "      <td>A soft, warm and versatile quarter-zip pullove...</td>\n",
       "      <td>M</td>\n",
       "      <td>Tops</td>\n",
       "      <td>[{'name': 'Gray', 'rgb': '#929292', 'message_i...</td>\n",
       "      <td>Poshmark</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Patagonia</td>\n",
       "      <td>Patagonia Los Gatos 1/4 Zip Pullover M Beech B...</td>\n",
       "      <td>https://poshmark.com/listing/63fcd7709f212bd48...</td>\n",
       "      <td>$59.00</td>\n",
       "      <td>High pile, quarter zip pulllover\\nMeasurements...</td>\n",
       "      <td>M</td>\n",
       "      <td>Tops</td>\n",
       "      <td>[{'name': 'Brown', 'rgb': '#663509', 'message_...</td>\n",
       "      <td>Poshmark</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Patagonia</td>\n",
       "      <td>PATAGONIA Women's Los Gatos Fleece 1/4-Zip Pul...</td>\n",
       "      <td>https://poshmark.com/listing/642b9bbcfed51f812...</td>\n",
       "      <td>$59.00</td>\n",
       "      <td>PATAGONIA Women's Los Gatos Fleece 1/4-Zip Pul...</td>\n",
       "      <td>S</td>\n",
       "      <td>Tops</td>\n",
       "      <td>[{'name': 'White', 'rgb': '#FFFFFF', 'message_...</td>\n",
       "      <td>Poshmark</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Patagonia</td>\n",
       "      <td>Girl’s Patagonia Los Gatos Fleece 1/4 Zip XS</td>\n",
       "      <td>https://poshmark.com/listing/63f4f459c5df6c7f8...</td>\n",
       "      <td>$30.00</td>\n",
       "      <td>Girl’s Patagonia Los Gatos 1/4 Zip Fleece\\n\\n-...</td>\n",
       "      <td>XSG</td>\n",
       "      <td>Other</td>\n",
       "      <td>[{'name': 'Tan', 'rgb': '#d1b48e', 'message_id...</td>\n",
       "      <td>Poshmark</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Patagonia</td>\n",
       "      <td>Patagonia Los Gatos Quarter Zip Grey</td>\n",
       "      <td>https://poshmark.com/listing/622cc43d3a0db900b...</td>\n",
       "      <td>$59.00</td>\n",
       "      <td>Patagonia Los Gatos Quarter Zip Grey \\nWomen’s...</td>\n",
       "      <td>M</td>\n",
       "      <td>Tops</td>\n",
       "      <td>[{'name': 'Gray', 'rgb': '#929292', 'message_i...</td>\n",
       "      <td>Poshmark</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 48 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       brand                                              title  \\\n",
       "0  Patagonia  Patagonia Women's Los Gatos Fleece 1/4-Zip Smo...   \n",
       "1  Patagonia  Patagonia Los Gatos 1/4 Zip Pullover M Beech B...   \n",
       "2  Patagonia  PATAGONIA Women's Los Gatos Fleece 1/4-Zip Pul...   \n",
       "3  Patagonia       Girl’s Patagonia Los Gatos Fleece 1/4 Zip XS   \n",
       "4  Patagonia               Patagonia Los Gatos Quarter Zip Grey   \n",
       "\n",
       "                                         product_url   price  \\\n",
       "0  https://poshmark.com/listing/63d4821f2fbf1afe8...  $36.00   \n",
       "1  https://poshmark.com/listing/63fcd7709f212bd48...  $59.00   \n",
       "2  https://poshmark.com/listing/642b9bbcfed51f812...  $59.00   \n",
       "3  https://poshmark.com/listing/63f4f459c5df6c7f8...  $30.00   \n",
       "4  https://poshmark.com/listing/622cc43d3a0db900b...  $59.00   \n",
       "\n",
       "                                         description size category  \\\n",
       "0  A soft, warm and versatile quarter-zip pullove...    M     Tops   \n",
       "1  High pile, quarter zip pulllover\\nMeasurements...    M     Tops   \n",
       "2  PATAGONIA Women's Los Gatos Fleece 1/4-Zip Pul...    S     Tops   \n",
       "3  Girl’s Patagonia Los Gatos 1/4 Zip Fleece\\n\\n-...  XSG    Other   \n",
       "4  Patagonia Los Gatos Quarter Zip Grey \\nWomen’s...    M     Tops   \n",
       "\n",
       "                                              colors  Poshmark  Unnamed: 9  \\\n",
       "0  [{'name': 'Gray', 'rgb': '#929292', 'message_i...  Poshmark       False   \n",
       "1  [{'name': 'Brown', 'rgb': '#663509', 'message_...  Poshmark       False   \n",
       "2  [{'name': 'White', 'rgb': '#FFFFFF', 'message_...  Poshmark       False   \n",
       "3  [{'name': 'Tan', 'rgb': '#d1b48e', 'message_id...  Poshmark       False   \n",
       "4  [{'name': 'Gray', 'rgb': '#929292', 'message_i...  Poshmark       False   \n",
       "\n",
       "   ...  Unnamed: 38 Unnamed: 39  Unnamed: 40  Unnamed: 41  Unnamed: 42  \\\n",
       "0  ...          NaN         NaN          NaN          NaN          NaN   \n",
       "1  ...          NaN         NaN          NaN          NaN          NaN   \n",
       "2  ...          NaN         NaN          NaN          NaN          NaN   \n",
       "3  ...          NaN         NaN          NaN          NaN          NaN   \n",
       "4  ...          NaN         NaN          NaN          NaN          NaN   \n",
       "\n",
       "   Unnamed: 43  Unnamed: 44  Unnamed: 45  Unnamed: 46  Unnamed: 47  \n",
       "0          NaN          NaN          NaN          NaN          NaN  \n",
       "1          NaN          NaN          NaN          NaN          NaN  \n",
       "2          NaN          NaN          NaN          NaN          NaN  \n",
       "3          NaN          NaN          NaN          NaN          NaN  \n",
       "4          NaN          NaN          NaN          NaN          NaN  \n",
       "\n",
       "[5 rows x 48 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "patagonia_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#download_images(patagonia_df, data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "load_models()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def generate_embeddings():\n",
    "    \"\"\"Embed every ``.jpg`` in the ``images/`` directory under ``data_path`` with each configured model.\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame with one row per image: a ``name`` column holding the\n",
    "        file name, plus one ``{model_name}-embedding`` column per key of\n",
    "        ``model_name_to_ids``. The frame is empty when no ``.jpg`` file is found.\n",
    "    \"\"\"\n",
    "    images_dir = os.path.join(data_path, 'images/')\n",
    "    records = []\n",
    "\n",
    "    # Inference only, so gradient tracking is switched off.\n",
    "    with torch.no_grad():\n",
    "        for fp in os.listdir(images_dir):\n",
    "            if not fp.endswith('.jpg'):\n",
    "                continue\n",
    "            new_row = {'name': fp}\n",
    "            # Open each image once for all models; the context manager closes the file.\n",
    "            with Image.open(os.path.join(images_dir, fp)) as image:\n",
    "                for model_name in model_name_to_ids:\n",
    "                    new_row[f'{model_name}-embedding'] = get_image_embeddings(model_name, image)\n",
    "            records.append(new_row)\n",
    "\n",
    "    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on\n",
    "    # every call, so collect the rows and build the frame once.\n",
    "    return pd.DataFrame(records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# First entry returned by os.listdir (order is arbitrary); path built with os.path.join.\n",
    "fp = os.listdir(os.path.join(data_path, 'images/'))[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "model_name = 'fashion'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Embed a single image with a single model; the context manager closes the file afterwards.\n",
    "with Image.open(os.path.join(data_path, 'images', fp)) as image:\n",
    "    new_row = {'name': fp, f'{model_name}-embedding': get_image_embeddings(model_name, image)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "embeddings_df = generate_embeddings()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>sentence-transformer-clip-ViT-L-14-embedding</th>\n",
       "      <th>fashion-embedding</th>\n",
       "      <th>openai-clip-embedding</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Women's Under Armour Hustle Fleece Hoodie pull...</td>\n",
       "      <td>[1.0734258, 0.99022365, 0.32032806, 0.2895219,...</td>\n",
       "      <td>[0.23177437, -1.9268938, 0.273342, -0.02474568...</td>\n",
       "      <td>[-0.32902592, -0.09434131, 0.3055967, 0.229937...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Patagonia Los Gatos Fleece Grey Pullover.jpg</td>\n",
       "      <td>[0.6227796, 0.026531212, 0.45240527, -0.488214...</td>\n",
       "      <td>[0.38133767, -1.3040155, 1.1697398, -0.3085520...</td>\n",
       "      <td>[-0.1695469, 0.5067289, 0.31120676, -0.0083701...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>REI Women's Down With It Quilted Hooded Parka ...</td>\n",
       "      <td>[0.8497103, 1.2925782, -0.21685322, 0.24116844...</td>\n",
       "      <td>[-0.30043703, -1.3144073, -0.33848628, 0.24008...</td>\n",
       "      <td>[-0.24841668, 0.4876942, 0.39810008, -0.141552...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Chanel Haute Couture Navy Blue Dress Semi Shee...</td>\n",
       "      <td>[0.536018, 0.60787296, -0.2751825, 1.0325747, ...</td>\n",
       "      <td>[-0.101031125, 0.033914, -0.44531134, -0.64656...</td>\n",
       "      <td>[-0.08328074, 0.19443086, 0.14361368, 0.259305...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Patagonia Women’s S Los Gatos Quarter-Zip Flee...</td>\n",
       "      <td>[0.79398394, 1.3899276, -0.21383175, 0.0109823...</td>\n",
       "      <td>[0.60070944, -1.1051046, 1.0572466, 0.47092092...</td>\n",
       "      <td>[-0.27894062, -0.09589732, 0.5556799, -0.13458...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>326</th>\n",
       "      <td>Women's REI Elements Jacket Size M.jpg</td>\n",
       "      <td>[0.6310029, 0.9942212, 0.009293936, 0.7862729,...</td>\n",
       "      <td>[0.19858713, -1.8665266, -0.3323754, 0.0465058...</td>\n",
       "      <td>[-0.0952643, 0.8016211, 0.08129032, 0.15187423...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>327</th>\n",
       "      <td>CHANEL Black cotton bodycon tank dress with zi...</td>\n",
       "      <td>[1.0761135, 0.18927886, -0.007131472, 0.625682...</td>\n",
       "      <td>[0.07516122, -0.1886161, 0.1334078, -0.2829321...</td>\n",
       "      <td>[-0.12297699, 0.026368856, 0.04415588, 0.26031...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>328</th>\n",
       "      <td>Reformation X Veda Women's Bad Leather Jacket ...</td>\n",
       "      <td>[0.79690784, 1.2895226, 0.22802149, -0.2736021...</td>\n",
       "      <td>[-0.12224964, -0.38734418, 0.35824925, 0.95855...</td>\n",
       "      <td>[0.6507246, 0.27751687, 0.36114892, -0.0831387...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>329</th>\n",
       "      <td>DISNEY HER UNIVERSE LILO AND STICH Rainbow Qua...</td>\n",
       "      <td>[1.1617887, 0.19193622, 0.046035454, 0.4334900...</td>\n",
       "      <td>[-0.20762922, 0.1754938, -0.7334341, -0.106492...</td>\n",
       "      <td>[-0.31946087, 0.19534132, 0.37351555, -0.09741...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>330</th>\n",
       "      <td>PATAGONIA Nano Puff Jacket Zip Primaloft Insul...</td>\n",
       "      <td>[0.2912089, 0.72192264, -0.01620815, 0.0022971...</td>\n",
       "      <td>[0.0026952028, -1.6660439, 0.03839147, -0.2164...</td>\n",
       "      <td>[0.12799336, 0.75828236, 0.10943861, -0.036647...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>331 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  name  \\\n",
       "0    Women's Under Armour Hustle Fleece Hoodie pull...   \n",
       "1         Patagonia Los Gatos Fleece Grey Pullover.jpg   \n",
       "2    REI Women's Down With It Quilted Hooded Parka ...   \n",
       "3    Chanel Haute Couture Navy Blue Dress Semi Shee...   \n",
       "4    Patagonia Women’s S Los Gatos Quarter-Zip Flee...   \n",
       "..                                                 ...   \n",
       "326             Women's REI Elements Jacket Size M.jpg   \n",
       "327  CHANEL Black cotton bodycon tank dress with zi...   \n",
       "328  Reformation X Veda Women's Bad Leather Jacket ...   \n",
       "329  DISNEY HER UNIVERSE LILO AND STICH Rainbow Qua...   \n",
       "330  PATAGONIA Nano Puff Jacket Zip Primaloft Insul...   \n",
       "\n",
       "          sentence-transformer-clip-ViT-L-14-embedding  \\\n",
       "0    [1.0734258, 0.99022365, 0.32032806, 0.2895219,...   \n",
       "1    [0.6227796, 0.026531212, 0.45240527, -0.488214...   \n",
       "2    [0.8497103, 1.2925782, -0.21685322, 0.24116844...   \n",
       "3    [0.536018, 0.60787296, -0.2751825, 1.0325747, ...   \n",
       "4    [0.79398394, 1.3899276, -0.21383175, 0.0109823...   \n",
       "..                                                 ...   \n",
       "326  [0.6310029, 0.9942212, 0.009293936, 0.7862729,...   \n",
       "327  [1.0761135, 0.18927886, -0.007131472, 0.625682...   \n",
       "328  [0.79690784, 1.2895226, 0.22802149, -0.2736021...   \n",
       "329  [1.1617887, 0.19193622, 0.046035454, 0.4334900...   \n",
       "330  [0.2912089, 0.72192264, -0.01620815, 0.0022971...   \n",
       "\n",
       "                                     fashion-embedding  \\\n",
       "0    [0.23177437, -1.9268938, 0.273342, -0.02474568...   \n",
       "1    [0.38133767, -1.3040155, 1.1697398, -0.3085520...   \n",
       "2    [-0.30043703, -1.3144073, -0.33848628, 0.24008...   \n",
       "3    [-0.101031125, 0.033914, -0.44531134, -0.64656...   \n",
       "4    [0.60070944, -1.1051046, 1.0572466, 0.47092092...   \n",
       "..                                                 ...   \n",
       "326  [0.19858713, -1.8665266, -0.3323754, 0.0465058...   \n",
       "327  [0.07516122, -0.1886161, 0.1334078, -0.2829321...   \n",
       "328  [-0.12224964, -0.38734418, 0.35824925, 0.95855...   \n",
       "329  [-0.20762922, 0.1754938, -0.7334341, -0.106492...   \n",
       "330  [0.0026952028, -1.6660439, 0.03839147, -0.2164...   \n",
       "\n",
       "                                 openai-clip-embedding  \n",
       "0    [-0.32902592, -0.09434131, 0.3055967, 0.229937...  \n",
       "1    [-0.1695469, 0.5067289, 0.31120676, -0.0083701...  \n",
       "2    [-0.24841668, 0.4876942, 0.39810008, -0.141552...  \n",
       "3    [-0.08328074, 0.19443086, 0.14361368, 0.259305...  \n",
       "4    [-0.27894062, -0.09589732, 0.5556799, -0.13458...  \n",
       "..                                                 ...  \n",
       "326  [-0.0952643, 0.8016211, 0.08129032, 0.15187423...  \n",
       "327  [-0.12297699, 0.026368856, 0.04415588, 0.26031...  \n",
       "328  [0.6507246, 0.27751687, 0.36114892, -0.0831387...  \n",
       "329  [-0.31946087, 0.19534132, 0.37351555, -0.09741...  \n",
       "330  [0.12799336, 0.75828236, 0.10943861, -0.036647...  \n",
       "\n",
       "[331 rows x 4 columns]"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embeddings_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "embeddings_path = os.path.join(data_path, 'metadata/patagonia_losGatos_embeddings.pq')\n",
    "embeddings_df.to_parquet(embeddings_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "embeddings_df = pd.read_parquet(embeddings_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# File names containing a newline are printed and then dropped.\n",
    "# A vectorised mask replaces iterrows() with one DataFrame.drop per hit;\n",
    "# na=False keeps rows whose name is missing instead of raising on them.\n",
    "has_newline = embeddings_df['name'].str.contains('\\n', regex=False, na=False)\n",
    "for name in embeddings_df.loc[has_newline, 'name']:\n",
    "    print(name)\n",
    "embeddings_df = embeddings_df.loc[~has_newline]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>sentence-transformer-clip-ViT-L-14-embedding</th>\n",
       "      <th>fashion-embedding</th>\n",
       "      <th>openai-clip-embedding</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Women's Under Armour Hustle Fleece Hoodie pull...</td>\n",
       "      <td>[1.0734258, 0.99022365, 0.32032806, 0.2895219,...</td>\n",
       "      <td>[0.23177437, -1.9268938, 0.273342, -0.02474568...</td>\n",
       "      <td>[-0.32902592, -0.09434131, 0.3055967, 0.229937...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Patagonia Los Gatos Fleece Grey Pullover.jpg</td>\n",
       "      <td>[0.6227796, 0.026531212, 0.45240527, -0.488214...</td>\n",
       "      <td>[0.38133767, -1.3040155, 1.1697398, -0.3085520...</td>\n",
       "      <td>[-0.1695469, 0.5067289, 0.31120676, -0.0083701...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>REI Women's Down With It Quilted Hooded Parka ...</td>\n",
       "      <td>[0.8497103, 1.2925782, -0.21685322, 0.24116844...</td>\n",
       "      <td>[-0.30043703, -1.3144073, -0.33848628, 0.24008...</td>\n",
       "      <td>[-0.24841668, 0.4876942, 0.39810008, -0.141552...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Chanel Haute Couture Navy Blue Dress Semi Shee...</td>\n",
       "      <td>[0.536018, 0.60787296, -0.2751825, 1.0325747, ...</td>\n",
       "      <td>[-0.101031125, 0.033914, -0.44531134, -0.64656...</td>\n",
       "      <td>[-0.08328074, 0.19443086, 0.14361368, 0.259305...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Patagonia Women’s S Los Gatos Quarter-Zip Flee...</td>\n",
       "      <td>[0.79398394, 1.3899276, -0.21383175, 0.0109823...</td>\n",
       "      <td>[0.60070944, -1.1051046, 1.0572466, 0.47092092...</td>\n",
       "      <td>[-0.27894062, -0.09589732, 0.5556799, -0.13458...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>326</th>\n",
       "      <td>Women's REI Elements Jacket Size M.jpg</td>\n",
       "      <td>[0.6310029, 0.9942212, 0.009293936, 0.7862729,...</td>\n",
       "      <td>[0.19858713, -1.8665266, -0.3323754, 0.0465058...</td>\n",
       "      <td>[-0.0952643, 0.8016211, 0.08129032, 0.15187423...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>327</th>\n",
       "      <td>CHANEL Black cotton bodycon tank dress with zi...</td>\n",
       "      <td>[1.0761135, 0.18927886, -0.007131472, 0.625682...</td>\n",
       "      <td>[0.07516122, -0.1886161, 0.1334078, -0.2829321...</td>\n",
       "      <td>[-0.12297699, 0.026368856, 0.04415588, 0.26031...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>328</th>\n",
       "      <td>Reformation X Veda Women's Bad Leather Jacket ...</td>\n",
       "      <td>[0.79690784, 1.2895226, 0.22802149, -0.2736021...</td>\n",
       "      <td>[-0.12224964, -0.38734418, 0.35824925, 0.95855...</td>\n",
       "      <td>[0.6507246, 0.27751687, 0.36114892, -0.0831387...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>329</th>\n",
       "      <td>DISNEY HER UNIVERSE LILO AND STICH Rainbow Qua...</td>\n",
       "      <td>[1.1617887, 0.19193622, 0.046035454, 0.4334900...</td>\n",
       "      <td>[-0.20762922, 0.1754938, -0.7334341, -0.106492...</td>\n",
       "      <td>[-0.31946087, 0.19534132, 0.37351555, -0.09741...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>330</th>\n",
       "      <td>PATAGONIA Nano Puff Jacket Zip Primaloft Insul...</td>\n",
       "      <td>[0.2912089, 0.72192264, -0.01620815, 0.0022971...</td>\n",
       "      <td>[0.0026952028, -1.6660439, 0.03839147, -0.2164...</td>\n",
       "      <td>[0.12799336, 0.75828236, 0.10943861, -0.036647...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>331 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  name  \\\n",
       "0    Women's Under Armour Hustle Fleece Hoodie pull...   \n",
       "1         Patagonia Los Gatos Fleece Grey Pullover.jpg   \n",
       "2    REI Women's Down With It Quilted Hooded Parka ...   \n",
       "3    Chanel Haute Couture Navy Blue Dress Semi Shee...   \n",
       "4    Patagonia Women’s S Los Gatos Quarter-Zip Flee...   \n",
       "..                                                 ...   \n",
       "326             Women's REI Elements Jacket Size M.jpg   \n",
       "327  CHANEL Black cotton bodycon tank dress with zi...   \n",
       "328  Reformation X Veda Women's Bad Leather Jacket ...   \n",
       "329  DISNEY HER UNIVERSE LILO AND STICH Rainbow Qua...   \n",
       "330  PATAGONIA Nano Puff Jacket Zip Primaloft Insul...   \n",
       "\n",
       "          sentence-transformer-clip-ViT-L-14-embedding  \\\n",
       "0    [1.0734258, 0.99022365, 0.32032806, 0.2895219,...   \n",
       "1    [0.6227796, 0.026531212, 0.45240527, -0.488214...   \n",
       "2    [0.8497103, 1.2925782, -0.21685322, 0.24116844...   \n",
       "3    [0.536018, 0.60787296, -0.2751825, 1.0325747, ...   \n",
       "4    [0.79398394, 1.3899276, -0.21383175, 0.0109823...   \n",
       "..                                                 ...   \n",
       "326  [0.6310029, 0.9942212, 0.009293936, 0.7862729,...   \n",
       "327  [1.0761135, 0.18927886, -0.007131472, 0.625682...   \n",
       "328  [0.79690784, 1.2895226, 0.22802149, -0.2736021...   \n",
       "329  [1.1617887, 0.19193622, 0.046035454, 0.4334900...   \n",
       "330  [0.2912089, 0.72192264, -0.01620815, 0.0022971...   \n",
       "\n",
       "                                     fashion-embedding  \\\n",
       "0    [0.23177437, -1.9268938, 0.273342, -0.02474568...   \n",
       "1    [0.38133767, -1.3040155, 1.1697398, -0.3085520...   \n",
       "2    [-0.30043703, -1.3144073, -0.33848628, 0.24008...   \n",
       "3    [-0.101031125, 0.033914, -0.44531134, -0.64656...   \n",
       "4    [0.60070944, -1.1051046, 1.0572466, 0.47092092...   \n",
       "..                                                 ...   \n",
       "326  [0.19858713, -1.8665266, -0.3323754, 0.0465058...   \n",
       "327  [0.07516122, -0.1886161, 0.1334078, -0.2829321...   \n",
       "328  [-0.12224964, -0.38734418, 0.35824925, 0.95855...   \n",
       "329  [-0.20762922, 0.1754938, -0.7334341, -0.106492...   \n",
       "330  [0.0026952028, -1.6660439, 0.03839147, -0.2164...   \n",
       "\n",
       "                                 openai-clip-embedding  \n",
       "0    [-0.32902592, -0.09434131, 0.3055967, 0.229937...  \n",
       "1    [-0.1695469, 0.5067289, 0.31120676, -0.0083701...  \n",
       "2    [-0.24841668, 0.4876942, 0.39810008, -0.141552...  \n",
       "3    [-0.08328074, 0.19443086, 0.14361368, 0.259305...  \n",
       "4    [-0.27894062, -0.09589732, 0.5556799, -0.13458...  \n",
       "..                                                 ...  \n",
       "326  [-0.0952643, 0.8016211, 0.08129032, 0.15187423...  \n",
       "327  [-0.12297699, 0.026368856, 0.04415588, 0.26031...  \n",
       "328  [0.6507246, 0.27751687, 0.36114892, -0.0831387...  \n",
       "329  [-0.31946087, 0.19534132, 0.37351555, -0.09741...  \n",
       "330  [0.12799336, 0.75828236, 0.10943861, -0.036647...  \n",
       "\n",
       "[331 rows x 4 columns]"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embeddings_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Print every entry in the images directory whose name contains a literal '?'.\n",
    "# os.path.join is used instead of string concatenation so the lookup works\n",
    "# whether or not data_path ends with a path separator.\n",
    "for fp in os.listdir(os.path.join(data_path, 'images')):\n",
    "    if '?' in fp:\n",
    "        print(fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "1+1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Write df to a tab-separated file, relative to the current working directory.\n",
    "df.to_csv(\"random.tsv\", sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "disco-io/data\n"
     ]
    }
   ],
   "source": [
    "import utils\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from utils import get_immediate_subdirectories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "disco-io/data\n",
      "Refreshing all datasets: ['test']\n"
     ]
    }
   ],
   "source": [
    "utils.refresh_all_datasets()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'test'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "utils.cur_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "disco-io/data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['test']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_immediate_subdirectories('data')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import fs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "s3_path = 'data'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prefix the relative path with the bucket name configured in utils.\n",
    "s3_full_path = \"{}/{}\".format(utils.S3_BUCKET, s3_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['disco-io/data/Cvlsntdjgrnuyrlf.jpg', 'disco-io/data/test']"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Glob one level beneath s3_full_path.\n",
    "fs.glob(s3_full_path + \"/*\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Derive the path from s3_full_path rather than repeating the bucket/prefix\n",
    "# literal, so this check follows utils.S3_BUCKET and s3_path if they change.\n",
    "fs.isdir(f\"{s3_full_path}/test\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  },
  "vscode": {
   "interpreter": {
    "hash": "e85fcd8d0dbb45c39d3e544566c77318961c8114425a16ff4cb5c14067743b34"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}