Spaces:
im committed
Commit · e9755d9
1 Parent(s): 4bb4754
add embeddings explanation and dimensionality reduction explanation
Browse files:
- app.py +282 -23
- requirements.txt +2 -1
app.py
CHANGED
@@ -1,4 +1,5 @@
import streamlit as st

# TODO: move to 'utils'
mystyle = '''
@@ -15,6 +16,11 @@ def divider():
_, c, _ = st.columns(3)
c.divider()

st.title("Transformers: Tokenisers and Embeddings")

preface_image, preface_text, = st.columns(2)
@@ -288,7 +294,7 @@ elif tokeniser_name == 'Unigram':
according to their probabilities.
""")

-st.subheader("Try Yourself:")
st.write(f"""\
*Using the text area field below, try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
@@ -358,7 +364,7 @@ elif tokeniser_name == 'WordPiece':
it.
""")

-st.subheader("Try Yourself:")
st.write(f"""\
*Using the text area field below, try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
@@ -472,11 +478,17 @@ st.write("""\
characteristics using numbers, not words.
""")

-

-col1, col2 = st.columns(2)
-token_king = col1.text_input("Choose a word to compare embeddings:", value="king")
-token_queen = col2.text_input("Choose a word to compare embeddings:", value="queen")

from torch import nn
from transformers import AutoConfig
@@ -502,28 +514,61 @@ openai.api_key = st.secrets["OPENAI_API_KEY"]
EMBEDDING_MODEL = 'text-embedding-ada-002'
EMBEDDING_CTX_LENGTH = 8191
EMBEDDING_ENCODING = 'cl100k_base'
-king =
-queen =

-
-
fig.update_layout(legend=dict(orientation="h"))
st.plotly_chart(fig, use_container_width=True)


-df = pd.DataFrame({f'"{token_king}" embeddings': king
-fig = px.line(df, title="OpenAI's 'text-embedding-ada-002' model embeddings")
fig.update_layout(legend=dict(orientation="h"))
st.plotly_chart(fig, use_container_width=True)


-

-
-

-
-

input = {word: get_embeddings(word) for word in sentence}

@@ -534,24 +579,238 @@ for i, word_i in enumerate(sentence):

fig = px.imshow(scores_matrix, x=sentence, y=sentence, color_continuous_scale="hot_r")
fig.update_layout(coloraxis_showscale=False)
-fig.update_layout(width=6000
st.plotly_chart(fig, use_container_width=True)

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema.document import Document
-db = FAISS.from_documents([Document(page_content="king"), Document(page_content="queen")], OpenAIEmbeddings(model=EMBEDDING_MODEL))

embeddings_query = st.text_input(label="search term")
if embeddings_query is not None and embeddings_query != '':
-
-docs
-st.write(docs[0].page_content)

-st.

with st.expander("References:"):
st.write("""\
- https://huggingface.co/blog/getting-started-with-embeddings
- https://huggingface.co/blog/1b-sentence-embeddings
""")
@@ -1,4 +1,5 @@
import streamlit as st
+import numpy as np

# TODO: move to 'utils'
mystyle = '''
@@ -15,6 +16,11 @@ def divider():
_, c, _ = st.columns(3)
c.divider()

+@st.cache_data
+def get_embeddings(text):
+return np.array(openai.Embedding.create(input=text, model=EMBEDDING_MODEL)["data"][0]["embedding"])
+
+
st.title("Transformers: Tokenisers and Embeddings")

preface_image, preface_text, = st.columns(2)
@@ -288,7 +294,7 @@ elif tokeniser_name == 'Unigram':
according to their probabilities.
""")

+st.subheader(":green[Try Yourself:]")
st.write(f"""\
*Using the text area field below, try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
@@ -358,7 +364,7 @@ elif tokeniser_name == 'WordPiece':
it.
""")

+st.subheader(":green[Try Yourself:]")
st.write(f"""\
*Using the text area field below, try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
@@ -472,11 +478,17 @@ st.write("""\
characteristics using numbers, not words.
""")

+st.write("""\
+Let's explore embeddings in more detail. We can take an experimental approach by encoding two specific
+words and examining the corresponding embedding vectors they generate. To make our exploration more accessible,
+we'll visualise a portion of these vectors, thereby unveiling the underlying structure of embeddings. Pay attention
+to common patterns and peaks, and try to find two words that yield differing embeddings.
+""")
+col1, col2, col3 = st.columns(3)
+token_king = col1.text_input("Choose a word:", value="king")
+token_queen = col2.text_input("Choose a word:", value="queen")
+token_dots = col3.number_input("Number of dots:", value=50, min_value=0, max_value=1536)


from torch import nn
from transformers import AutoConfig
@@ -502,28 +514,61 @@ openai.api_key = st.secrets["OPENAI_API_KEY"]
EMBEDDING_MODEL = 'text-embedding-ada-002'
EMBEDDING_CTX_LENGTH = 8191
EMBEDDING_ENCODING = 'cl100k_base'
+king = get_embeddings(token_king)
+queen = get_embeddings(token_queen)

+
+df = pd.DataFrame({f'"{token_king}" embeddings': king_emb_np, f'"{token_queen}" embeddings': queen_emb_np})
+fig = px.line(df[:token_dots], title=f"Google's 'bert-base-uncased' model embeddings, embedding vector size: {len(queen_emb_np)}")
fig.update_layout(legend=dict(orientation="h"))
st.plotly_chart(fig, use_container_width=True)

+with st.expander("Python Code:"):
+st.code(f"""\
+from torch import nn
+from transformers import AutoConfig
+
+model_ckpt = 'bert-base-uncased'
+tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+king_id = tokenizer("{token_king}", return_tensors="pt", add_special_tokens=False)
+queen_id = tokenizer("{token_queen}", return_tensors="pt", add_special_tokens=False)
+
+config = AutoConfig.from_pretrained(model_ckpt)
+token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
+king_embeddings = token_emb(king_id.input_ids)
+queen_embeddings = token_emb(queen_id.input_ids)
+""")

+df = pd.DataFrame({f'"{token_king}" embeddings': king, f'"{token_queen}" embeddings': queen})
+fig = px.line(df[:token_dots], title=f"OpenAI's 'text-embedding-ada-002' model embeddings, embedding vector size: {len(queen)}")
fig.update_layout(legend=dict(orientation="h"))
st.plotly_chart(fig, use_container_width=True)


+with st.expander("Python Code:"):
+st.code(f"""\
+import openai
+
+EMBEDDING_MODEL = 'text-embedding-ada-002'

+king_embeddings = np.array(openai.Embedding.create(input="{token_king}", model=EMBEDDING_MODEL)["data"][0]["embedding"])
+queen_embeddings = np.array(openai.Embedding.create(input="{token_queen}", model=EMBEDDING_MODEL)["data"][0]["embedding"])
+""")

+st.write("""\
+The similarity can be represented as a similarity score. Identical words naturally have the highest
+score (black colours), while unrelated terms have lower scores (white colours). To compute this score,
+we construct a matrix infused with our embedding vectors. Each row in this matrix corresponds to a unique word in the
+sentence, while each column aligns with another word. The value at the intersection of row i and column j represents
+the score between word i and word j. For a clearer understanding, let's visualise this matrix using a heatmap. Each
+cell in the grid corresponds to a pair of words, and the colour of the cell indicates the similarity (correlation)
+score between those two words. The intensity of the colour directly corresponds to the magnitude of the score - the
+darker the hue, the higher the score.
+""")
+
+st.write("""Here is a heatmap of the score matrix for the sentence:""")
+sentence = st.text_input(label="*words to explore embeddings*", value="a the king queen space sit eat from on")
+sentence = sentence.split()

input = {word: get_embeddings(word) for word in sentence}

@@ -534,24 +579,238 @@ for i, word_i in enumerate(sentence):

fig = px.imshow(scores_matrix, x=sentence, y=sentence, color_continuous_scale="hot_r")
fig.update_layout(coloraxis_showscale=False)
+fig.update_layout(width=6000)
st.plotly_chart(fig, use_container_width=True)
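The lines that actually fill scores_matrix sit between the two visible parts of this hunk; only the loop header for i, word_i in enumerate(sentence): survives in the hunk context above. A minimal sketch of how such a pairwise score matrix could be built from the input dictionary of embeddings, assuming a plain dot-product similarity (not the commit's exact code):

# Sketch (assumed): pairwise dot-product scores between the embedding
# vectors stored in the `input` dictionary, one row/column per word.
scores_matrix = np.zeros((len(sentence), len(sentence)))
for i, word_i in enumerate(sentence):
    for j, word_j in enumerate(sentence):
        scores_matrix[i, j] = np.dot(input[word_i], input[word_j])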

+st.subheader(":green[Try Yourself:]")
+
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema.document import Document

+@st.cache_resource
+def create_vector_database():
+return FAISS.from_documents([Document(page_content="king"), Document(page_content="queen")], OpenAIEmbeddings(model=EMBEDDING_MODEL))
+db = create_vector_database()
+
+@st.cache_data
+def search_vector_database(term):
+embedding_vector = OpenAIEmbeddings(model=EMBEDDING_MODEL).embed_query(term)
+docs = db.similarity_search_by_vector(embedding_vector)
+return docs
+
+st.write("""\
+*There is a vector database containing two words: 'king' and 'queen'. Your task is to pinpoint search
+terms that would yield either of these words. To facilitate this, use the previously presented similarity matrix to
+seek out words that give a higher correlation with the word in question. For instance, you might want to explore
+terms such as 'king', 'queen', 'dog', 'prince', 'man', 'minister', 'boy'.*
+""")
embeddings_query = st.text_input(label="search term")
if embeddings_query is not None and embeddings_query != '':
+docs = search_vector_database(embeddings_query)
+st.warning(docs[0].page_content)

+with st.expander("Python Code:"):
+st.code(f"""\
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.schema.document import Document
+
+
+db = FAISS.from_documents([Document(page_content="king"), Document(page_content="queen")], OpenAIEmbeddings(model=EMBEDDING_MODEL))
+embedding_vector = OpenAIEmbeddings(model=EMBEDDING_MODEL).embed_query("{embeddings_query}")
+docs = db.similarity_search_by_vector(embedding_vector)
+""")
+
+divider()
+st.caption("Conclusion")
+st.write("""\
+As embedding algorithms are trained on a vast corpus of data, they inherently encapsulate a rich
+tapestry of information about our language and even the world at large. Therefore, they can be used for:
+
+- Search (where results are ranked by relevance to a query string)
+- Clustering (where text strings are grouped by similarity)
+- Recommendations (where items with related text strings are recommended)
+- Anomaly detection (where outliers with little relatedness are identified)
+- Diversity measurement (where similarity distributions are analyzed)
+- Classification (where text strings are classified by their most similar label)
+""")

with st.expander("References:"):
st.write("""\
- https://huggingface.co/blog/getting-started-with-embeddings
- https://huggingface.co/blog/1b-sentence-embeddings
+- https://platform.openai.com/docs/guides/embeddings/use-cases
+""")
+
+divider()
+st.header("Dimensionality Reduction (optional)")
+
+
+st.write("""\
+As was mentioned above, embedding vectors are learned in such a way that words with similar meanings
+are located close to each other in the space. However, this is an abstract concept that might be difficult to
+explore, understand and visualise in a 2D space because word embeddings typically have hundreds of dimensions. To
+solve this, we can use techniques like Principal Component Analysis (PCA) or t-SNE to reduce the dimensionality of
+the vectors and plot them.
+""")
+st.write("""But first, let's talk about the meaning of dimensionality reduction using a simplified use-case:""")
+
+dimensionality_name = st.selectbox(label="Choose your example", options=["Simplified", "PCA", 't-SNE'])
+if dimensionality_name == 'Simplified':
+_, col2, _ = st.columns(3)
+col2.image("assets/img.png")
+st.write("""\
+**Step 1: The context**\n
+We have a 3D object (your hand) and a light source that's casting a 2D shadow of your hand onto a
+wall. The shadow is a simpler, lower-dimensional representation of your hand.
+
+**Step 2: Identifying the dimensions**\n
+In this case, the dimensions are the different aspects of your hand that can be
+observed: the length of your fingers, the width of your palm, the height (or depth) of your hand, the scars,
+the colour of the skin, etc. However, we have a problem: we can't easily visualise or understand all these dimensions
+at once, just as it's hard to imagine a 6-dimensional space.
+
+**Step 3: Deciding on important dimensions**\n
+Let's say you want to compare the number of fingers of different hands. In
+this case, you don't need to know about the depth of the hand, the width of the palm, or other details like freckles,
+scars, or skin colour. You just need a shadow that clearly shows the fingers. So, you decide to focus on the length
+of the fingers, which can be easily shown in the shadow.
+
+**Step 4: Reducing dimensions**\n
+This is where you actually perform dimensionality reduction. You orient your hand in such
+a way (giving the wall a high-five) that the shadow clearly shows the fingers. You've effectively reduced the
+dimensions from 3D to 2D. Your hand is still a 3D object, but its shadow, the simplified representation you're using
+for your comparison, is 2D.
+
+**Step 5: Interpretation**\n
+This hand and shadow example shows how dimensionality reduction simplifies a complex object (
+the 3D hand) into a lower-dimensional representation (the 2D shadow) that retains the most important information (the
+number of fingers) while discarding the less important details (like the depth of the hand, skin colour, etc.). It's
+a process of prioritisation and simplification that makes it easier for us to understand and analyse the data (or the
+hands, in this case).
+""")
+elif dimensionality_name == 'PCA':
+st.write("""\
+**Step 1: Understanding PCA**\n
+PCA is a popular method for dimensionality reduction. It identifies the
+axes in the feature space along which the original data varies the most. These axes are known as the principal
+components, and they are orthogonal (perpendicular) to each other.
+
+**Step 2: Projecting the Data**\n
+Imagine that instead of just casting a shadow on the wall, you can cast your hand's
+shadow onto a number of walls arranged at different angles around your hand. Each shadow is a different projection of
+your hand. In PCA, these different walls represent different principal components, and the shadow on each wall is a
+projection of your hand onto that principal component.
+
+**Step 3: Choosing the Best Projection**\n
+Now, consider the shadow that most accurately portrays the number of fingers on
+your hand. This shadow corresponds to the principal component that captures the most variance in the data. In PCA,
+this would be the first principal component.
+
+**Step 4: Secondary Features**\n
+Next, consider the shadow that, while not as accurate as the first, still gives a
+reasonable representation of your hand, such as showing the width of your palm. This shadow represents the second
+principal component, which captures the second highest amount of variance in the data.
+
+**Step 5: Reduction of Dimensions**\n
+In the process of reducing dimensions, we select the top few principal components (
+shadows) that capture the most variance. The other dimensions (shadows) are discarded. So, instead of having to
+consider the complex 3D structure of your hand, you can simply look at one or two shadows that give you the most
+information about the hand.
+
+**Step 6: Transformation**\n
+Finally, we transform the original data into the reduced dimensional space defined by the
+selected principal components. This is analogous to replacing each hand with the selected shadows for further analysis.
+By using PCA, we can reduce the complexity of the data (from a 3D hand to a 2D or even 1D shadow), while still
+retaining the most important information (like the number of fingers or the width of the palm). This makes the data
+easier to visualize, understand, and work with.
+""")
+embedding_dim = 1536
+embeddings = st.text_input("words to explore:",
+value="king queen man woman prince prince princess counselor minister teacher")
+embeddings = embeddings.split()
+embeddings = {word: get_embeddings(word) for word in embeddings}
+
+from sklearn.decomposition import PCA
+
+pca = PCA(n_components=2)
+embedding_matrix = np.array(list(embeddings.values()))
+reduced_embeddings = pca.fit_transform(embedding_matrix)
+
+df = pd.DataFrame(reduced_embeddings, columns=["X", "Y"])
+df["Word"] = list(embeddings.keys())
+fig = px.scatter(df, x="X", y="Y", text="Word", title="Word Embeddings", width=800, height=800)
+st.plotly_chart(fig, use_container_width=True)
+
+st.code(f"""\
+from sklearn.decomposition import PCA
+
+pca = PCA(n_components=2)
+embedding_matrix = np.array(list(embeddings.values()))
+reduced_embeddings = pca.fit_transform(embedding_matrix)
+""", language='python')
+
+elif dimensionality_name == 't-SNE':
+st.write("""\
+**Step 1: Understanding t-SNE**\n
+t-SNE is a technique for dimensionality reduction that is particularly
+well-suited for the visualization of high-dimensional datasets. Unlike PCA, which is a linear technique,
+t-SNE is a non-linear technique, making it better at capturing complex non-linear relationships between variables.
+
+**Step 2: Measuring Similarities**\n
+Imagine that instead of just one hand, you have many hands casting shadows. Each hand
+is different - some hands might have longer fingers, some might have a wider palm, and so on. Each hand has its own
+"neighborhood" of similar hands. In t-SNE, these neighborhoods are represented mathematically by a probability
+distribution. Hands that are very similar to each other have a high probability of being "neighbors", while hands
+that are very different have a low probability.
+
+**Step 3: Creating a Map**\n
+t-SNE creates a map (or a projection) where hands that were close in the high-dimensional
+space (similar hands) are still close in the low-dimensional space (in their shadows), and hands that were far apart
+in the high-dimensional space (different hands) are still far apart in the low-dimensional space. This map is created
+in such a way that it minimizes the difference between the distances in the high-dimensional space and the distances
+in the low-dimensional space.
+
+**Step 4: Reducing Dimensions**\n
+The process of reducing dimensions in t-SNE involves optimizing the locations of each
+hand's shadow in the low-dimensional space such that the overall configuration of shadows best represents the
+similarities between the hands in the high-dimensional space.
+
+**Step 5: Interpretation**\n
+The result of t-SNE is a map where similar hands are located close together and dissimilar
+hands are located far apart. This makes it easier to visualize clusters or groups of similar hands.
+t-SNE, therefore, helps us to project high-dimensional data into a lower-dimensional space in a way that preserves
+the structure of the data as much as possible, making it easier to visualize and understand the relationships in the
+data.
+""")
+embedding_dim = 1536
+embeddings = st.text_input("words to explore:",
+value="king queen man woman prince prince princess counselor minister teacher")
+embeddings = embeddings.split()
+embeddings = {word: get_embeddings(word) for word in embeddings}
+
+from sklearn.manifold import TSNE
+
+tsne = TSNE(n_components=2, perplexity=2, random_state=0)
+embedding_matrix = np.array(list(embeddings.values()))
+reduced_embeddings = tsne.fit_transform(embedding_matrix)
+
+df = pd.DataFrame(reduced_embeddings, columns=["X", "Y"])
+df["Word"] = list(embeddings.keys())
+fig = px.scatter(df, x="X", y="Y", text="Word", title="Word Embeddings", width=800, height=800)
+st.plotly_chart(fig, use_container_width=True)
+
+st.code(f"""\
+from sklearn.manifold import TSNE
+
+tsne = TSNE(n_components=2, perplexity=2, random_state=0)
+embedding_matrix = np.array(list(embeddings.values()))
+reduced_embeddings = tsne.fit_transform(embedding_matrix)
+""", language='python')
+
+with st.expander("References:"):
+st.write("""\
+- https://hex.tech/blog/dimensionality-reduction/
+- https://github.com/openai/openai-cookbook/blob/main/examples/Visualizing_embeddings_in_2D.ipynb
""")
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ openai~=0.27.8
plotly~=5.15.0
langchain~=0.0.242
faiss-cpu~=1.7.4
-tiktoken~=0.4.0
+tiktoken~=0.4.0
+scikit-learn~=1.3.0