"""Streamlit app that suggests LeetCode links similar to a text query.

The query is embedded with a HuggingFace model and matched against a
pre-built vector store that is loaded from a local pickle file.
"""

import io
import pickle

import streamlit as st
import torch
from bs4 import BeautifulSoup
from langchain.embeddings import HuggingFaceEmbeddings


class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that forces pickled torch storages to load on the CPU.

    Lookups of ``torch.storage._load_from_bytes`` are redirected to a
    loader that calls ``torch.load(..., map_location='cpu')``; every other
    global is resolved by the standard ``pickle.Unpickler`` machinery.
    """

    def find_class(self, module, name):
        """Resolve ``module.name``, substituting a CPU-mapping tensor loader."""
        if module == 'torch.storage' and name == '_load_from_bytes':
            # presumably the store was pickled on a GPU machine; mapping to
            # CPU lets it load on hosts without that device -- confirm.
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)


@st.cache_resource
def get_hugging_face_model():
    """Build the HuggingFace embedding model used to embed queries.

    Decorated with ``st.cache_resource`` so the model object is reused
    instead of being rebuilt on every script run.
    """
    model_name = "mchochlov/codebert-base-cd-ft"
    return HuggingFaceEmbeddings(model_name=model_name)


@st.cache_resource
def get_db():
    """Load the vector store from ``codesearchdb.pickle``.

    The path is relative, so the file is looked up in the process's
    current working directory.

    Raises:
        FileNotFoundError: if the pickle file does not exist.
    """
    # SECURITY: unpickling can execute arbitrary code. Only ship a
    # ``codesearchdb.pickle`` that comes from a trusted source.
    with open("codesearchdb.pickle", "rb") as f:
        return CPU_Unpickler(f).load()


def get_similar_links(query, db, embeddings):
    """Return the hrefs found in the documents most similar to ``query``.

    Args:
        query: Text to search for.
        db: Vector store exposing ``similarity_search_by_vector``; each
            result must expose HTML text as ``page_content``.
        embeddings: Embedding model exposing ``embed_query``.

    Returns:
        A flat list of every ``href`` attribute of every ``<a>`` tag in the
        matched documents, in result order. Duplicates are kept.
    """
    query_vector = embeddings.embed_query(query)
    documents = db.similarity_search_by_vector(query_vector)

    links = []
    for document in documents:
        soup = BeautifulSoup(document.page_content, 'html.parser')
        # href=True restricts the match to anchors that carry an href.
        links.extend(a['href'] for a in soup.find_all('a', href=True))
    return links


# NOTE: despite its name, ``embedding_vector`` holds the embedding *model*.
embedding_vector = get_hugging_face_model()
db = get_db()

st.title("📒 DSASearch Engine 🤖 ")
text_input = st.text_input("Enter some text")
# NOTE(review): the button's return value is never consulted; the search
# below runs on any rerun where the text box is non-empty. Confirm whether
# the click was meant to gate the search.
button = st.button("Find Similar Questions on Leetcode")

# Whitespace-only input is treated as empty rather than searched for.
if text_input.strip():
    query = text_input
    answer = get_similar_links(query, db, embedding_vector)
    for link in answer:
        st.write(link)
    st.balloons()
else:
    st.info("Please Input Valid Text")

st.markdown("""
### Created by Ashwin Rachha.
Source Data : https://github.com/AshwinRachha/LeetCode-Solutions
Medium Blog : https://medium.com/@ashwin_rachha/querying-a-code-database-to-find-similar-coding-problems-using-langchain-814730da6e6d
""")