File size: 3,109 Bytes
db02cae |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
import spacy
import unicodedata
import re
import streamlit as st
from sentence_transformers import SentenceTransformer, util
# Preprocess the text
def clean_text(text):
# Normalize Unicode characters
text = unicodedata.normalize('NFKC', text)
# Replace non-breaking or zero-width spaces with regular spaces
text = text.replace('\u200a', ' ').replace('\u00a0', ' ')
return text
# Load the SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')
# Load and clean the text file
with open('long-story.txt', encoding='utf-8') as f:
text = f.read()
cleaned_text = clean_text(text)
# Define the regular expression to match titles and the content between them
pattern = r"\n\n([A-Z\s]+)\n([^\n]+(?:\n[^\n]+)*)"
# Use findall to capture all title-content pairs
sections = re.findall(pattern, cleaned_text)
# Store the sections in a dictionary with embeddings for content
sections_dict = {}
embeddings_dict = {}
for title, content in sections:
# Clean the title and content by stripping unnecessary newlines
cleaned_title = title.strip().lower() # Convert the title to lowercase for case-insensitive matching
cleaned_content = content.strip()
# Create embeddings for each section content
content_embedding = model.encode(cleaned_content, convert_to_tensor=True)
sections_dict[cleaned_title] = cleaned_content
embeddings_dict[cleaned_title] = content_embedding
# Function to retrieve content based on user input (semantic matching with SBERT)
def get_section(user_input):
# Normalize user input to lowercase for matching
user_input = user_input.lower()
# Generate the embedding for the user input
user_input_embedding = model.encode(user_input, convert_to_tensor=True)
best_match = None
best_score = -1
# Iterate through the sections to find the best match based on cosine similarity
for title, section_embedding in embeddings_dict.items():
cosine_score = util.pytorch_cos_sim(user_input_embedding, section_embedding)[0][0].item()
if cosine_score > best_score:
best_score = cosine_score
best_match = title
# Return the best matching section
if best_score > 0.5: # You can adjust the threshold based on your needs
return sections_dict[best_match]
else:
return "No matching section found."
# Streamlit UI
def chatbot_ui():
st.title("Text-Based Chatbot")
# Display instructions
st.write("Ask the chatbot for specific sections from the document by typing keywords like 'productivity', 'life hacks', 'communication skills', 'skill development', 'personal development', 'goal setting' ")
# Input field for the user
user_input = st.text_input("Enter a keyword", "")
# If the user enters a keyword, get the matching section and display it
if user_input:
section_content = get_section(user_input)
st.write(section_content)
# Run the Streamlit app
if __name__ == "__main__":
chatbot_ui()
|