{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "3e7b6247", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2023-06-29 09:08:24,868] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", "\n", "===================================BUG REPORT===================================\n", "Welcome to bitsandbytes. For bug reports, please run\n", "\n", "python -m bitsandbytes\n", "\n", " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n", "================================================================================\n", "bin /home/sourab/miniconda3/envs/ml/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda118.so\n", "CUDA SETUP: CUDA runtime path found: /home/sourab/miniconda3/envs/ml/lib/libcudart.so\n", "CUDA SETUP: Highest compute capability among GPUs detected: 7.5\n", "CUDA SETUP: Detected CUDA version 118\n", "CUDA SETUP: Loading binary /home/sourab/miniconda3/envs/ml/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/sourab/miniconda3/envs/ml/lib/python3.11/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: Found duplicate ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] files: {PosixPath('/home/sourab/miniconda3/envs/ml/lib/libcudart.so'), PosixPath('/home/sourab/miniconda3/envs/ml/lib/libcudart.so.11.0')}.. We'll flip a coin and try one of these, in order to fail forward.\n", "Either way, this might cause trouble in the future:\n", "If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.\n", " warn(msg)\n" ] } ], "source": [ "import argparse\n", "import json\n", "import logging\n", "import math\n", "import os\n", "import random\n", "from pathlib import Path\n", "from tqdm import tqdm\n", "\n", "import datasets\n", "from datasets import load_dataset, DatasetDict\n", "\n", "import evaluate\n", "import torch\n", "from torch import nn\n", "from torch.utils.data import DataLoader\n", "\n", "import transformers\n", "from transformers import AutoTokenizer, AutoModel, default_data_collator, SchedulerType, get_scheduler\n", "from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry\n", "from transformers.utils.versions import require_version\n", "\n", "from huggingface_hub import Repository, create_repo\n", "\n", "from accelerate import Accelerator\n", "from accelerate.logging import get_logger\n", "from accelerate.utils import set_seed\n", "\n", "from peft import PeftModel\n", "\n", "import hnswlib" ] }, { "cell_type": "code", "execution_count": 2, "id": "c939b4fd", "metadata": {}, "outputs": [], "source": [ "class AutoModelForSentenceEmbedding(nn.Module):\n", " def __init__(self, model_name, tokenizer, normalize=True):\n", " super(AutoModelForSentenceEmbedding, self).__init__()\n", "\n", " self.model = AutoModel.from_pretrained(model_name) # , load_in_8bit=True, device_map={\"\":0})\n", " self.normalize = normalize\n", " self.tokenizer = tokenizer\n", "\n", " def forward(self, **kwargs):\n", " model_output = self.model(**kwargs)\n", " embeddings = self.mean_pooling(model_output, kwargs[\"attention_mask\"])\n", " if self.normalize:\n", " embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)\n", "\n", " return embeddings\n", "\n", " def mean_pooling(self, model_output, attention_mask):\n", " token_embeddings = model_output[0] # First element of model_output contains all token embeddings\n", " input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n", " return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)\n", "\n", " def __getattr__(self, name: str):\n", " \"\"\"Forward missing attributes to the wrapped module.\"\"\"\n", " try:\n", " return super().__getattr__(name) # defer to nn.Module's logic\n", " except AttributeError:\n", " return getattr(self.model, name)\n", "\n", "\n", "def get_cosing_embeddings(query_embs, product_embs):\n", " return torch.sum(query_embs * product_embs, axis=1)" ] }, { "cell_type": "code", "execution_count": 3, "id": "8b5d9256", "metadata": {}, "outputs": [], "source": [ "model_name_or_path = \"intfloat/e5-large-v2\"\n", "peft_model_id = \"smangrul/peft_lora_e5_semantic_search\"\n", "dataset_name = \"smangrul/amazon_esci\"\n", "max_length = 70\n", "batch_size = 256" ] }, { "cell_type": "code", "execution_count": 4, "id": "f190e1ee", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Found cached dataset parquet (/raid/sourab/.cache/huggingface/datasets/smangrul___parquet/smangrul--amazon_esci-321288cabf0cc045/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "43b84641575e4ce6899a3e6f61d7e126", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/2 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexproduct_title
00RamPro 10\" All Purpose Utility Air Tires/Wheel...
11MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...
22NEIKO 20601A 14.5 inch Steel Tire Spoon Lever ...
332PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...
44(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...
.........
476273476273Chanel No.5 Eau Premiere Spray 50ml/1.7oz
476274476274Steve Madden Designer 15 Inch Carry on Suitcas...
476275476275CHANEL Le Lift Creme Yeux, Black, 0.5 Ounce
476276476276Coco Mademoiselle by Chanel for Women - 3.4 oz...
476277476277Chânél No. 5 by Chânél Eau De Parfum Premiere ...
\n", "

476278 rows × 2 columns

\n", "" ], "text/plain": [ " index product_title\n", "0 0 RamPro 10\" All Purpose Utility Air Tires/Wheel...\n", "1 1 MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...\n", "2 2 NEIKO 20601A 14.5 inch Steel Tire Spoon Lever ...\n", "3 3 2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...\n", "4 4 (Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...\n", "... ... ...\n", "476273 476273 Chanel No.5 Eau Premiere Spray 50ml/1.7oz\n", "476274 476274 Steve Madden Designer 15 Inch Carry on Suitcas...\n", "476275 476275 CHANEL Le Lift Creme Yeux, Black, 0.5 Ounce\n", "476276 476276 Coco Mademoiselle by Chanel for Women - 3.4 oz...\n", "476277 476277 Chânél No. 5 by Chânél Eau De Parfum Premiere ...\n", "\n", "[476278 rows x 2 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "product_dataset_for_indexing" ] }, { "cell_type": "code", "execution_count": 6, "id": "85840ec6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexproduct_title
3471034710ROK 4-1/2 inch Diamond Saw Blade Set, Pack of 3
277590277590WSGG Medical Goggles, FDA registered, Safety Goggles, Fit Over Glasses, Anti-Fog, Anti-Splash (1 pack)
474000474000iJDMTOY 15W CREE High Power LED Angel Eye Bulbs Compatible With BMW 5 6 7 Series X3 X5 (E39 E60 E63 E65 E53), 7000K Xenon White Headlight Ring Marker Lights
1899718997USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More
208666208666AOGGY Compatible with MacBook Air 13 inch Case A1466/A1369 (2010-2017 Release) Glitter Fluorescent Color Plastic Hard Case, with Older Version MacBook Air 13 inch Keyboard Cover - Gold
326614326614CUTE STONE Little Kitchen Playset, Kitchen Toy Set with Realistic Sound &Light, Play Sink, Cooking Stove with Steam, Play Food and Kitchen Accessories, Great Kitchen Toys for Toddlers Kids
105637105637Milwaukee Electric Tool 2470-21 M12 Cordless Shear Kit, 12 V, Li-Ion
342392342392chouyatou Women's Short Sleeve/Strap Open Bust Bodysuit Shapewear Firm Control Body Shaper (X-Small, Nude Sleeve)
319970319970AMT 256 Hz Medical-Grade Tuning Fork Instrument with Fixed Weights, Non-Magnetic Aluminum Alloy (C 256)
416956416956Timberland HIKER-ROUND 54 BROWN
\n", "
" ], "text/plain": [ " index \\\n", "34710 34710 \n", "277590 277590 \n", "474000 474000 \n", "18997 18997 \n", "208666 208666 \n", "326614 326614 \n", "105637 105637 \n", "342392 342392 \n", "319970 319970 \n", "416956 416956 \n", "\n", " product_title \n", "34710 ROK 4-1/2 inch Diamond Saw Blade Set, Pack of 3 \n", "277590 WSGG Medical Goggles, FDA registered, Safety Goggles, Fit Over Glasses, Anti-Fog, Anti-Splash (1 pack) \n", "474000 iJDMTOY 15W CREE High Power LED Angel Eye Bulbs Compatible With BMW 5 6 7 Series X3 X5 (E39 E60 E63 E65 E53), 7000K Xenon White Headlight Ring Marker Lights \n", "18997 USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More \n", "208666 AOGGY Compatible with MacBook Air 13 inch Case A1466/A1369 (2010-2017 Release) Glitter Fluorescent Color Plastic Hard Case, with Older Version MacBook Air 13 inch Keyboard Cover - Gold \n", "326614 CUTE STONE Little Kitchen Playset, Kitchen Toy Set with Realistic Sound &Light, Play Sink, Cooking Stove with Steam, Play Food and Kitchen Accessories, Great Kitchen Toys for Toddlers Kids \n", "105637 Milwaukee Electric Tool 2470-21 M12 Cordless Shear Kit, 12 V, Li-Ion \n", "342392 chouyatou Women's Short Sleeve/Strap Open Bust Bodysuit Shapewear Firm Control Body Shaper (X-Small, Nude Sleeve) \n", "319970 AMT 256 Hz Medical-Grade Tuning Fork Instrument with Fixed Weights, Non-Magnetic Aluminum Alloy (C 256) \n", "416956 Timberland HIKER-ROUND 54 BROWN " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.set_option(\"max_colwidth\", 300)\n", "product_dataset_for_indexing.sample(10)" ] }, { "cell_type": "code", "execution_count": 7, "id": "408b6e00", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Running tokenizer on dataset: 0%| | 0/476278 [00:00 k\n", "\n", " # Query dataset, k - number of the closest elements (returns 2 numpy arrays)\n", " labels, distances = search_index.knn_query(query_embeddings, k=k)\n", "\n", " return [\n", " (ids_to_products_dict[label], (1 - distance))\n", " for label, distance in zip(labels[0], distances[0])\n", " if (1 - distance) >= threshold\n", " ]" ] }, { "cell_type": "code", "execution_count": 97, "id": "1c47f12d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "query='NLP and ML books'\n", "cosine_sim_score=0.92 product='Machine Learning: A Journey from Beginner to Advanced Including Deep Learning, Scikit-learn and Tensorflow'\n", "cosine_sim_score=0.91 product='Mastering Machine Learning with scikit-learn'\n", "cosine_sim_score=0.91 product='Hands-On Machine Learning with Scikit-Learn and TensorFlow: Concepts, Tools, and Techniques to Build Intelligent Systems'\n", "cosine_sim_score=0.91 product='Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow: Concepts, Tools, and Techniques to Build Intelligent Systems'\n", "cosine_sim_score=0.91 product='Practical Deep Learning: A Python-Based Introduction'\n", "cosine_sim_score=0.9 product='Machine Learning: A Hands-On, Project-Based Introduction to Machine Learning for Absolute Beginners: Mastering Engineering ML Systems using Scikit-Learn and TensorFlow'\n", "cosine_sim_score=0.9 product='Mastering Machine Learning with scikit-learn - Second Edition: Apply effective learning algorithms to real-world problems using scikit-learn'\n", "cosine_sim_score=0.9 product='Mastering Machine Learning on AWS: Advanced machine learning in Python using SageMaker, Apache Spark, and TensorFlow'\n", "cosine_sim_score=0.9 product='Machine Learning Algorithms: Naive Bayes'\n", "cosine_sim_score=0.9 product='Fundamentals of Machine Learning for Predictive Data Anayltics: Algorithms, Worked Examples, and Case Studies'\n" ] } ], "source": [ "query = \"NLP and ML books\"\n", "k = 10\n", "query_embeddings = get_query_embeddings(query, model, tokenizer, device)\n", "search_results = get_nearest_neighbours(k, product_search_index, query_embeddings, ids_to_products_dict, threshold=0.7)\n", "\n", "print(f\"{query=}\")\n", "for product, cosine_sim_score in search_results:\n", " print(f\"cosine_sim_score={round(cosine_sim_score,2)} {product=}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "e9e2dd2c", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" } }, "nbformat": 4, "nbformat_minor": 5 }