thomasht86 commited on
Commit
b7897bb
β€’
1 Parent(s): bb4f59a

Upload folder using huggingface_hub

Browse files
Files changed (14) hide show
  1. .DS_Store +0 -0
  2. .env.example +2 -1
  3. .gitignore +3 -1
  4. README.md +20 -1
  5. backend/colpali.py +25 -7
  6. frontend/app.py +46 -13
  7. frontend/layout.py +95 -14
  8. globals.css +22 -1
  9. main.py +106 -19
  10. output.css +77 -1
  11. prepare_feed_deploy.py +977 -0
  12. pyproject.toml +10 -1
  13. static/.DS_Store +0 -0
  14. uv.lock +0 -0
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
.env.example CHANGED
@@ -1,3 +1,4 @@
1
  VESPA_APP_URL=https://abcde.z.vespa-app.cloud
2
  HF_TOKEN=hf_xxxxxxxxxx
3
- VESPA_CLOUD_SECRET_TOKEN=vespa_cloud_xxxxxxxx
 
 
1
  VESPA_APP_URL=https://abcde.z.vespa-app.cloud
2
  HF_TOKEN=hf_xxxxxxxxxx
3
+ VESPA_CLOUD_SECRET_TOKEN=vespa_cloud_xxxxxxxx
4
+ GEMINI_API_KEY=
.gitignore CHANGED
@@ -1,8 +1,10 @@
1
  .sesskey
2
  .venv/
3
  __pycache__/
 
4
  .python-version
5
  .env
6
  template/
7
  *.json
8
- output/
 
 
1
  .sesskey
2
  .venv/
3
  __pycache__/
4
+ ipynb_checkpoints/
5
  .python-version
6
  .env
7
  template/
8
  *.json
9
+ output/
10
+ pdfs/
README.md CHANGED
@@ -27,7 +27,7 @@ preload_from_hub:
27
 
28
  # Visual Retrieval ColPali
29
 
30
- # Developing
31
 
32
  First, install `uv`:
33
 
@@ -35,6 +35,25 @@ First, install `uv`:
35
  curl -LsSf https://astral.sh/uv/install.sh | sh
36
  ```
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  Then, in this directory, run:
39
 
40
  ```bash
 
27
 
28
  # Visual Retrieval ColPali
29
 
30
+ # Prepare data and Vespa application
31
 
32
  First, install `uv`:
33
 
 
35
  curl -LsSf https://astral.sh/uv/install.sh | sh
36
  ```
37
 
38
+ Then, run:
39
+
40
+ ```bash
41
+ uv sync --extra dev --extra feed
42
+ ```
43
+
44
+ Convert the `prepare_feed_deploy.py` to notebook to:
45
+
46
+ ```bash
47
+ jupytext --to notebook prepare_feed_deploy.py
48
+ ```
49
+
50
+ And launch a Jupyter instance, see https://docs.astral.sh/uv/guides/integration/jupyter/ for recommended approach.
51
+
52
+ Open and follow the `prepare_feed_deploy.ipynb` notebook to prepare the data and deploy the Vespa application.
53
+
54
+ # Developing on the web app
55
+
56
+
57
  Then, in this directory, run:
58
 
59
  ```bash
backend/colpali.py CHANGED
@@ -170,13 +170,13 @@ def gen_similarity_maps(
170
  if vespa_sim_maps:
171
  print("Using provided similarity maps")
172
  # A sim map looks like this:
173
- # "similarities": [
174
  # {
175
  # "address": {
176
  # "patch": "0",
177
  # "querytoken": "0"
178
  # },
179
- # "value": 1.2599412202835083
180
  # },
181
  # ... and so on.
182
  # Now turn these into a tensor of same shape as previous similarity map
@@ -189,7 +189,7 @@ def gen_similarity_maps(
189
  )
190
  )
191
  for idx, vespa_sim_map in enumerate(vespa_sim_maps):
192
- for cell in vespa_sim_map["similarities"]["cells"]:
193
  patch = int(cell["address"]["patch"])
194
  # if dummy model then just use 1024 as the image_seq_length
195
 
@@ -359,7 +359,7 @@ async def query_vespa_default(
359
  start = time.perf_counter()
360
  response: VespaQueryResponse = await session.query(
361
  body={
362
- "yql": "select id,title,url,full_image,page_number,snippet,text,summaryfeatures from pdf_page where userQuery();",
363
  "ranking": "default",
364
  "query": query,
365
  "timeout": timeout,
@@ -392,7 +392,7 @@ async def query_vespa_bm25(
392
  start = time.perf_counter()
393
  response: VespaQueryResponse = await session.query(
394
  body={
395
- "yql": "select id,title,url,full_image,page_number,snippet,text,summaryfeatures from pdf_page where userQuery();",
396
  "ranking": "bm25",
397
  "query": query,
398
  "timeout": timeout,
@@ -472,7 +472,7 @@ async def query_vespa_nearest_neighbor(
472
  **query_tensors,
473
  "presentation.timing": True,
474
  # if we use rank({nn_string}, userQuery()), dynamic summary doesn't work, see https://github.com/vespa-engine/vespa/issues/28704
475
- "yql": f"select id,title,snippet,text,url,full_image,page_number,summaryfeatures from pdf_page where {nn_string} or userQuery()",
476
  "ranking.profile": "retrieval-and-rerank",
477
  "timeout": timeout,
478
  "hits": hits,
@@ -492,6 +492,24 @@ def is_special_token(token: str) -> bool:
492
  return True
493
  return False
494
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
 
496
  async def get_result_from_query(
497
  app: Vespa,
@@ -538,7 +556,7 @@ def add_sim_maps_to_result(
538
  imgs: List[str] = []
539
  vespa_sim_maps: List[str] = []
540
  for single_result in result["root"]["children"]:
541
- img = single_result["fields"]["full_image"]
542
  if img:
543
  imgs.append(img)
544
  vespa_sim_map = single_result["fields"].get("summaryfeatures", None)
 
170
  if vespa_sim_maps:
171
  print("Using provided similarity maps")
172
  # A sim map looks like this:
173
+ # "quantized": [
174
  # {
175
  # "address": {
176
  # "patch": "0",
177
  # "querytoken": "0"
178
  # },
179
+ # "value": 12, # score in range [-128, 127]
180
  # },
181
  # ... and so on.
182
  # Now turn these into a tensor of same shape as previous similarity map
 
189
  )
190
  )
191
  for idx, vespa_sim_map in enumerate(vespa_sim_maps):
192
+ for cell in vespa_sim_map["quantized"]["cells"]:
193
  patch = int(cell["address"]["patch"])
194
  # if dummy model then just use 1024 as the image_seq_length
195
 
 
359
  start = time.perf_counter()
360
  response: VespaQueryResponse = await session.query(
361
  body={
362
+ "yql": "select id,title,url,blur_image,page_number,snippet,text,summaryfeatures from pdf_page where userQuery();",
363
  "ranking": "default",
364
  "query": query,
365
  "timeout": timeout,
 
392
  start = time.perf_counter()
393
  response: VespaQueryResponse = await session.query(
394
  body={
395
+ "yql": "select id,title,url,blur_image,page_number,snippet,text,summaryfeatures from pdf_page where userQuery();",
396
  "ranking": "bm25",
397
  "query": query,
398
  "timeout": timeout,
 
472
  **query_tensors,
473
  "presentation.timing": True,
474
  # if we use rank({nn_string}, userQuery()), dynamic summary doesn't work, see https://github.com/vespa-engine/vespa/issues/28704
475
+ "yql": f"select id,title,snippet,text,url,blur_image,page_number,summaryfeatures from pdf_page where {nn_string} or userQuery()",
476
  "ranking.profile": "retrieval-and-rerank",
477
  "timeout": timeout,
478
  "hits": hits,
 
492
  return True
493
  return False
494
 
495
+ async def get_full_image_from_vespa(
496
+ app: Vespa,
497
+ id: str) -> str:
498
+ async with app.asyncio(connections=1, total_timeout=120) as session:
499
+ start = time.perf_counter()
500
+ response: VespaQueryResponse = await session.query(
501
+ body={
502
+ "yql": f"select full_image from pdf_page where id contains \"{id}\"",
503
+ "ranking": "unranked",
504
+ "presentation.timing": True,
505
+ },
506
+ )
507
+ assert response.is_successful(), response.json
508
+ stop = time.perf_counter()
509
+ print(
510
+ f"Getting image from Vespa took: {stop - start} s, vespa said searchtime was {response.json.get('timing', {}).get('searchtime', -1)} s"
511
+ )
512
+ return response.json["root"]["children"][0]["fields"]["full_image"]
513
 
514
  async def get_result_from_query(
515
  app: Vespa,
 
556
  imgs: List[str] = []
557
  vespa_sim_maps: List[str] = []
558
  for single_result in result["root"]["children"]:
559
+ img = single_result["fields"]["blur_image"]
560
  if img:
561
  imgs.append(img)
562
  vespa_sim_map = single_result["fields"].get("summaryfeatures", None)
frontend/app.py CHANGED
@@ -131,9 +131,13 @@ def SearchBox(with_border=False, query_value="", ranking_value="nn+colpali"):
131
 
132
  def SampleQueries():
133
  sample_queries = [
134
- "Percentage of non-fresh water as source?",
135
- "Policies related to nature risk?",
136
- "How much of produced water is recycled?",
 
 
 
 
137
  ]
138
 
139
  query_badges = []
@@ -193,21 +197,23 @@ def Search(request, search_results=[]):
193
  )
194
  return Div(
195
  Div(
196
- SearchBox(query_value=query_value, ranking_value=ranking_value),
197
  Div(
198
- LoadingMessage(),
199
- id="search-results", # This will be replaced by the search results
 
 
 
 
200
  ),
201
  cls="grid",
202
  ),
203
- cls="grid",
204
  )
205
 
206
 
207
- def LoadingMessage():
208
  return Div(
209
  Lucide(icon="loader-circle", cls="size-5 mr-1.5 animate-spin"),
210
- Span("Retrieving search results", cls="text-base text-center"),
211
  cls="p-10 text-muted-foreground flex items-center justify-center",
212
  id="loading-indicator",
213
  )
@@ -250,7 +256,7 @@ def SearchResult(results: list, query_id: Optional[str] = None):
250
  result_items = []
251
  for idx, result in enumerate(results):
252
  fields = result["fields"] # Extract the 'fields' part of each result
253
- full_image_base64 = f"data:image/jpeg;base64,{fields['full_image']}"
254
 
255
  # Filter sim_map fields that are words with 4 or more characters
256
  sim_map_fields = {
@@ -286,7 +292,7 @@ def SearchResult(results: list, query_id: Optional[str] = None):
286
  "Reset",
287
  variant="outline",
288
  size="sm",
289
- data_image_src=full_image_base64,
290
  cls="reset-button pointer-events-auto font-mono text-xs h-5 rounded-none px-2",
291
  )
292
 
@@ -312,7 +318,11 @@ def SearchResult(results: list, query_id: Optional[str] = None):
312
  Div(
313
  Div(
314
  Img(
315
- src=full_image_base64,
 
 
 
 
316
  alt=fields["title"],
317
  cls="result-image w-full h-full object-contain",
318
  ),
@@ -350,12 +360,35 @@ def SearchResult(results: list, query_id: Optional[str] = None):
350
  ),
351
  cls="bg-background px-3 py-5 hidden md:block",
352
  ),
353
- cls="grid grid-cols-1 md:grid-cols-2 col-span-2",
354
  )
355
  )
 
356
  return Div(
357
  *result_items,
358
  image_swapping,
359
  id="search-results",
360
  cls="grid grid-cols-2 gap-px bg-border",
361
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  def SampleQueries():
133
  sample_queries = [
134
+ "Proportion of female new hires 2021-2023?",
135
+ "Total amount of performance-based pay awarded in 2023?",
136
+ "What is the percentage distribution of employees with performance-based pay relative to the limit in 2023?",
137
+ "What is the breakdown of management costs by investment strategy in 2023?",
138
+ "2023 profit loss portfolio",
139
+ "net cash flow operating activities",
140
+ "fund currency basket returns",
141
  ]
142
 
143
  query_badges = []
 
197
  )
198
  return Div(
199
  Div(
 
200
  Div(
201
+ SearchBox(query_value=query_value, ranking_value=ranking_value),
202
+ Div(
203
+ LoadingMessage(),
204
+ id="search-results", # This will be replaced by the search results
205
+ ),
206
+ cls="grid",
207
  ),
208
  cls="grid",
209
  ),
 
210
  )
211
 
212
 
213
+ def LoadingMessage(display_text="Retrieving search results"):
214
  return Div(
215
  Lucide(icon="loader-circle", cls="size-5 mr-1.5 animate-spin"),
216
+ Span(display_text, cls="text-base text-center"),
217
  cls="p-10 text-muted-foreground flex items-center justify-center",
218
  id="loading-indicator",
219
  )
 
256
  result_items = []
257
  for idx, result in enumerate(results):
258
  fields = result["fields"] # Extract the 'fields' part of each result
259
+ blur_image_base64 = f"data:image/jpeg;base64,{fields['blur_image']}"
260
 
261
  # Filter sim_map fields that are words with 4 or more characters
262
  sim_map_fields = {
 
292
  "Reset",
293
  variant="outline",
294
  size="sm",
295
+ data_image_src=blur_image_base64,
296
  cls="reset-button pointer-events-auto font-mono text-xs h-5 rounded-none px-2",
297
  )
298
 
 
318
  Div(
319
  Div(
320
  Img(
321
+ src=blur_image_base64,
322
+ hx_get=f"/full_image?id={fields['id']}",
323
+ style="filter: blur(5px);",
324
+ hx_trigger="load",
325
+ hx_swap="outerHTML",
326
  alt=fields["title"],
327
  cls="result-image w-full h-full object-contain",
328
  ),
 
360
  ),
361
  cls="bg-background px-3 py-5 hidden md:block",
362
  ),
363
+ cls="grid grid-cols-1 md:grid-cols-2 col-span-2 border-t",
364
  )
365
  )
366
+
367
  return Div(
368
  *result_items,
369
  image_swapping,
370
  id="search-results",
371
  cls="grid grid-cols-2 gap-px bg-border",
372
  )
373
+
374
+
375
+ def ChatResult(query_id: str, query: str):
376
+ return Div(
377
+ Div("Chat", cls="text-xl font-semibold p-3"),
378
+ Div(
379
+ Div(
380
+ Div(
381
+ LoadingMessage(display_text="Waiting for response..."),
382
+ cls="bg-muted/80 dark:bg-muted/40 text-black dark:text-white p-2 rounded-md",
383
+ hx_ext="sse",
384
+ sse_connect=f"/get-message?query_id={query_id}&query={quote_plus(query)}",
385
+ sse_swap="message",
386
+ sse_close="close",
387
+ hx_swap="innerHTML",
388
+ ),
389
+ ),
390
+ id="chat-messages",
391
+ cls="overflow-auto min-h-0 grid items-end px-3",
392
+ ),
393
+ cls="h-full grid grid-rows-[auto_1fr_auto] min-h-0 gap-3",
394
+ )
frontend/layout.py CHANGED
@@ -1,15 +1,96 @@
1
- from fasthtml.components import Div, Img, Nav, Title, Body, Header, Main
2
- from fasthtml.xtend import A
3
  from lucide_fasthtml import Lucide
4
  from shad4fast import Button, Separator
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  def Logo():
8
  return Div(
9
- Img(src='https://assets.vespa.ai/logos/vespa-logo-black.svg', alt='Vespa Logo', cls='h-full dark:hidden'),
10
- Img(src='https://assets.vespa.ai/logos/vespa-logo-white.svg', alt='Vespa Logo Dark Mode',
11
- cls='h-full hidden dark:block'),
12
- cls='h-[27px]'
 
 
 
 
 
 
 
13
  )
14
 
15
 
@@ -38,23 +119,23 @@ def Links():
38
  ),
39
  Separator(orientation="vertical"),
40
  ThemeToggle(),
41
- cls='flex items-center space-x-3'
42
  )
43
 
44
 
45
  def Layout(*c, **kwargs):
46
  return (
47
- Title('Visual Retrieval ColPali'),
48
  Body(
49
  Header(
50
  A(Logo(), href="/"),
51
  Links(),
52
- cls='min-h-[55px] h-[55px] w-full flex items-center justify-between px-4'
53
- ),
54
- Main(
55
- *c, **kwargs,
56
- cls='flex-1 h-full'
57
  ),
58
- cls='h-full flex flex-col'
 
 
59
  ),
 
 
60
  )
 
1
+ from fasthtml.components import Body, Div, Header, Img, Nav, Title
2
+ from fasthtml.xtend import A, Script
3
  from lucide_fasthtml import Lucide
4
  from shad4fast import Button, Separator
5
 
6
+ script = Script(
7
+ """
8
+ document.addEventListener("DOMContentLoaded", function () {
9
+ const main = document.querySelector('main');
10
+ const aside = document.querySelector('aside');
11
+ const body = document.body;
12
+
13
+ if (main && aside && main.nextElementSibling === aside) {
14
+ // Main + Aside layout
15
+ body.classList.add('grid-cols-[minmax(0,_4fr)_minmax(0,_1fr)]');
16
+ aside.classList.remove('hidden');
17
+ } else if (main) {
18
+ // Only Main layout (full width)
19
+ body.classList.add('grid-cols-[1fr]');
20
+ }
21
+ });
22
+ """
23
+ )
24
+
25
+ overlay_scrollbars = Script(
26
+ """
27
+ (function () {
28
+ const { OverlayScrollbars } = OverlayScrollbarsGlobal;
29
+
30
+ function getPreferredTheme() {
31
+ return localStorage.theme === 'dark' || (!('theme' in localStorage) && window.matchMedia('(prefers-color-scheme: dark)').matches)
32
+ ? 'dark'
33
+ : 'light';
34
+ }
35
+
36
+ function applyOverlayScrollbars(element, scrollbarTheme) {
37
+ // Destroy existing OverlayScrollbars instance if it exists
38
+ const instance = OverlayScrollbars(element);
39
+ if (instance) {
40
+ instance.destroy();
41
+ }
42
+
43
+ // Reinitialize OverlayScrollbars with the new theme
44
+ OverlayScrollbars(element, {
45
+ scrollbars: {
46
+ theme: scrollbarTheme,
47
+ visibility: 'auto',
48
+ autoHide: 'leave',
49
+ autoHideDelay: 800
50
+ }
51
+ });
52
+ }
53
+
54
+ function updateScrollbarTheme() {
55
+ const isDarkMode = getPreferredTheme() === 'dark';
56
+ const scrollbarTheme = isDarkMode ? 'os-theme-light' : 'os-theme-dark'; // Light theme in dark mode, dark theme in light mode
57
+
58
+ const mainElement = document.querySelector('main');
59
+ const chatMessagesElement = document.querySelector('#chat-messages'); // Select the chat message container by ID
60
+
61
+ if (mainElement) {
62
+ applyOverlayScrollbars(mainElement, scrollbarTheme);
63
+ }
64
+
65
+ if (chatMessagesElement) {
66
+ applyOverlayScrollbars(chatMessagesElement, scrollbarTheme);
67
+ }
68
+ }
69
+
70
+ // Apply the correct theme immediately when the page loads
71
+ updateScrollbarTheme();
72
+
73
+ // Observe changes in the 'dark' class on the <html> element
74
+ const observer = new MutationObserver(updateScrollbarTheme);
75
+ observer.observe(document.documentElement, { attributes: true, attributeFilter: ['class'] });
76
+ })();
77
+ """
78
+ )
79
+
80
 
81
  def Logo():
82
  return Div(
83
+ Img(
84
+ src="https://assets.vespa.ai/logos/vespa-logo-black.svg",
85
+ alt="Vespa Logo",
86
+ cls="h-full dark:hidden",
87
+ ),
88
+ Img(
89
+ src="https://assets.vespa.ai/logos/vespa-logo-white.svg",
90
+ alt="Vespa Logo Dark Mode",
91
+ cls="h-full hidden dark:block",
92
+ ),
93
+ cls="h-[27px]",
94
  )
95
 
96
 
 
119
  ),
120
  Separator(orientation="vertical"),
121
  ThemeToggle(),
122
+ cls="flex items-center space-x-3",
123
  )
124
 
125
 
126
  def Layout(*c, **kwargs):
127
  return (
128
+ Title("Visual Retrieval ColPali"),
129
  Body(
130
  Header(
131
  A(Logo(), href="/"),
132
  Links(),
133
+ cls="min-h-[55px] h-[55px] w-full flex items-center justify-between px-4",
 
 
 
 
134
  ),
135
+ *c,
136
+ **kwargs,
137
+ cls="grid grid-rows-[55px_1fr] min-h-0",
138
  ),
139
+ script,
140
+ overlay_scrollbars,
141
  )
globals.css CHANGED
@@ -183,4 +183,25 @@
183
  width: 100%;
184
  height: 100%;
185
  z-index: 10;
186
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  width: 100%;
184
  height: 100%;
185
  z-index: 10;
186
+ }
187
+
188
+ header {
189
+ grid-column: 1/-1;
190
+ }
191
+
192
+ main {
193
+ overflow: auto;
194
+ }
195
+
196
+ aside {
197
+ overflow: auto;
198
+ }
199
+
200
+ .scroll-container {
201
+ padding-right: 10px;
202
+ }
203
+
204
+ .question-message {
205
+ background-color: #61D790;
206
+ color: #2E2F27;
207
+ }
main.py CHANGED
@@ -1,22 +1,25 @@
1
  import asyncio
 
 
2
  from concurrent.futures import ThreadPoolExecutor
3
  from functools import partial
4
 
5
  from fasthtml.common import *
6
  from shad4fast import *
7
  from vespa.application import Vespa
8
- import time
9
 
 
10
  from backend.colpali import (
11
- get_result_from_query,
12
- get_query_embeddings_and_token_map,
13
  add_sim_maps_to_result,
 
 
14
  is_special_token,
 
15
  )
16
- from backend.vespa_app import get_vespa_app
17
- from backend.cache import LRUCache
18
  from backend.modelmanager import ModelManager
 
19
  from frontend.app import (
 
20
  Home,
21
  Search,
22
  SearchBox,
@@ -25,7 +28,10 @@ from frontend.app import (
25
  SimMapButtonReady,
26
  )
27
  from frontend.layout import Layout
28
- import hashlib
 
 
 
29
 
30
  highlight_js_theme_link = Link(id="highlight-theme", rel="stylesheet", href="")
31
  highlight_js_theme = Script(src="/static/js/highlightjs-theme.js")
@@ -35,15 +41,27 @@ highlight_js = HighlightJS(
35
  light="github",
36
  )
37
 
 
 
 
 
 
 
 
 
 
38
 
39
  app, rt = fast_app(
40
- htmlkw={"cls": "h-full"},
41
  pico=False,
42
  hdrs=(
43
  ShadHead(tw_cdn=False, theme_handle=True),
44
  highlight_js,
45
  highlight_js_theme_link,
46
  highlight_js_theme,
 
 
 
47
  ),
48
  )
49
  vespa_app: Vespa = get_vespa_app()
@@ -53,6 +71,16 @@ task_cache = LRUCache(
53
  max_size=1000
54
  ) # Map from query_id to boolean value - False if not all results are ready.
55
  thread_pool = ThreadPoolExecutor()
 
 
 
 
 
 
 
 
 
 
56
 
57
 
58
  @app.on_event("startup")
@@ -72,7 +100,7 @@ def serve_static(filepath: str):
72
 
73
  @rt("/")
74
  def get():
75
- return Layout(Home())
76
 
77
 
78
  @rt("/search")
@@ -86,16 +114,18 @@ def get(request):
86
  if not query_value:
87
  # Show SearchBox and a message for missing query
88
  return Layout(
89
- Div(
90
- SearchBox(query_value=query_value, ranking_value=ranking_value),
91
  Div(
92
- P(
93
- "No query provided. Please enter a query.",
94
- cls="text-center text-muted-foreground",
 
 
 
 
95
  ),
96
- cls="p-10",
97
- ),
98
- cls="grid",
99
  )
100
  )
101
  # Generate a unique query_id based on the query and ranking value
@@ -107,7 +137,12 @@ def get(request):
107
  # search_results = get_results_children(result)
108
  # return Layout(Search(request, search_results))
109
  # Show the loading message if a query is provided
110
- return Layout(Search(request)) # Show SearchBox and Loading message initially
 
 
 
 
 
111
 
112
 
113
  @rt("/fetch_results")
@@ -215,15 +250,67 @@ async def get_sim_map(query_id: str, idx: int, token: str):
215
  sim_map_b64 = search_results[idx]["fields"].get(sim_map_key, None)
216
  if sim_map_b64 is None:
217
  return SimMapButtonPoll(query_id=query_id, idx=idx, token=token)
218
- sim_map_img_src = f"data:image/jpeg;base64,{sim_map_b64}"
219
  return SimMapButtonReady(
220
  query_id=query_id, idx=idx, token=token, img_src=sim_map_img_src
221
  )
222
 
223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  @rt("/app")
225
  def get():
226
- return Layout(Div(P(f"Connected to Vespa at {vespa_app.url}"), cls="p-4"))
227
 
228
 
229
  if __name__ == "__main__":
 
1
  import asyncio
2
+ import hashlib
3
+ import time
4
  from concurrent.futures import ThreadPoolExecutor
5
  from functools import partial
6
 
7
  from fasthtml.common import *
8
  from shad4fast import *
9
  from vespa.application import Vespa
 
10
 
11
+ from backend.cache import LRUCache
12
  from backend.colpali import (
 
 
13
  add_sim_maps_to_result,
14
+ get_query_embeddings_and_token_map,
15
+ get_result_from_query,
16
  is_special_token,
17
+ get_full_image_from_vespa,
18
  )
 
 
19
  from backend.modelmanager import ModelManager
20
+ from backend.vespa_app import get_vespa_app
21
  from frontend.app import (
22
+ ChatResult,
23
  Home,
24
  Search,
25
  SearchBox,
 
28
  SimMapButtonReady,
29
  )
30
  from frontend.layout import Layout
31
+ import google.generativeai as genai
32
+ from PIL import Image
33
+ import io
34
+ import base64
35
 
36
  highlight_js_theme_link = Link(id="highlight-theme", rel="stylesheet", href="")
37
  highlight_js_theme = Script(src="/static/js/highlightjs-theme.js")
 
41
  light="github",
42
  )
43
 
44
+ overlayscrollbars_link = Link(
45
+ rel="stylesheet",
46
+ href="https://cdnjs.cloudflare.com/ajax/libs/overlayscrollbars/2.10.0/styles/overlayscrollbars.min.css",
47
+ type="text/css",
48
+ )
49
+ overlayscrollbars_js = Script(
50
+ src="https://cdnjs.cloudflare.com/ajax/libs/overlayscrollbars/2.10.0/browser/overlayscrollbars.browser.es5.min.js"
51
+ )
52
+ sselink = Script(src="https://unpkg.com/htmx-ext-sse@2.2.1/sse.js")
53
 
54
  app, rt = fast_app(
55
+ htmlkw={"cls": "grid h-full"},
56
  pico=False,
57
  hdrs=(
58
  ShadHead(tw_cdn=False, theme_handle=True),
59
  highlight_js,
60
  highlight_js_theme_link,
61
  highlight_js_theme,
62
+ overlayscrollbars_link,
63
+ overlayscrollbars_js,
64
+ sselink,
65
  ),
66
  )
67
  vespa_app: Vespa = get_vespa_app()
 
71
  max_size=1000
72
  ) # Map from query_id to boolean value - False if not all results are ready.
73
  thread_pool = ThreadPoolExecutor()
74
+ # Gemini config
75
+
76
+ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
77
+ GEMINI_SYSTEM_PROMPT = """If the user query is a question, try your best to answer it based on the provided images.
78
+ If the user query is not an obvious question, reply with 'No question detected.'. Your response should be HTML formatted.
79
+ This means that newlines will be replaced with <br> tags, bold text will be enclosed in <b> tags, and so on.
80
+ """
81
+ gemini_model = genai.GenerativeModel(
82
+ "gemini-1.5-flash-8b", system_instruction=GEMINI_SYSTEM_PROMPT
83
+ )
84
 
85
 
86
  @app.on_event("startup")
 
100
 
101
  @rt("/")
102
  def get():
103
+ return Layout(Main(Home()))
104
 
105
 
106
  @rt("/search")
 
114
  if not query_value:
115
  # Show SearchBox and a message for missing query
116
  return Layout(
117
+ Main(
 
118
  Div(
119
+ SearchBox(query_value=query_value, ranking_value=ranking_value),
120
+ Div(
121
+ P(
122
+ "No query provided. Please enter a query.",
123
+ cls="text-center text-muted-foreground",
124
+ ),
125
+ cls="p-10",
126
  ),
127
+ cls="grid",
128
+ )
 
129
  )
130
  )
131
  # Generate a unique query_id based on the query and ranking value
 
137
  # search_results = get_results_children(result)
138
  # return Layout(Search(request, search_results))
139
  # Show the loading message if a query is provided
140
+ return Layout(
141
+ Main(Search(request), data_overlayscrollbars_initialize=True, cls="border-t"),
142
+ Aside(
143
+ ChatResult(query_id=query_id, query=query_value), cls="border-t border-l"
144
+ ),
145
+ ) # Show SearchBox and Loading message initially
146
 
147
 
148
  @rt("/fetch_results")
 
250
  sim_map_b64 = search_results[idx]["fields"].get(sim_map_key, None)
251
  if sim_map_b64 is None:
252
  return SimMapButtonPoll(query_id=query_id, idx=idx, token=token)
253
+ sim_map_img_src = f"data:image/png;base64,{sim_map_b64}"
254
  return SimMapButtonReady(
255
  query_id=query_id, idx=idx, token=token, img_src=sim_map_img_src
256
  )
257
 
258
 
259
+ @app.get("/full_image")
260
+ async def full_image(id: str):
261
+ """
262
+ Endpoint to get the full quality image for a given result id.
263
+ """
264
+ image_data = await get_full_image_from_vespa(vespa_app, id)
265
+
266
+ # Decode the base64 image data
267
+ # image_data = base64.b64decode(image_data)
268
+ image_data = "data:image/jpeg;base64," + image_data
269
+
270
+ return Img(
271
+ src=image_data,
272
+ alt="something",
273
+ cls="result-image w-full h-full object-contain",
274
+ )
275
+
276
+
277
+ async def message_generator(query_id: str, query: str):
278
+ result = None
279
+ while result is None:
280
+ result = result_cache.get(query_id)
281
+ await asyncio.sleep(0.5)
282
+ search_results = get_results_children(result)
283
+ images = [result["fields"]["blur_image"] for result in search_results]
284
+ # from b64 to PIL image
285
+ images = [Image.open(io.BytesIO(base64.b64decode(img))) for img in images]
286
+
287
+ # If newlines are present in the response, the connection will be closed.
288
+ def replace_newline_with_br(text):
289
+ return text.replace("\n", "<br>")
290
+
291
+ response_text = ""
292
+ async for chunk in await gemini_model.generate_content_async(
293
+ images + ["\n\n Query: ", query], stream=True
294
+ ):
295
+ if chunk.text:
296
+ response_text += chunk.text
297
+ response_text = replace_newline_with_br(response_text)
298
+ yield f"event: message\ndata: {response_text}\n\n"
299
+ await asyncio.sleep(0.5)
300
+ yield "event: close\ndata: \n\n"
301
+
302
+
303
+ @app.get("/get-message")
304
+ async def get_message(query_id: str, query: str):
305
+ return StreamingResponse(
306
+ message_generator(query_id=query_id, query=query),
307
+ media_type="text/event-stream",
308
+ )
309
+
310
+
311
  @rt("/app")
312
  def get():
313
+ return Layout(Main(Div(P(f"Connected to Vespa at {vespa_app.url}"), cls="p-4")))
314
 
315
 
316
  if __name__ == "__main__":
output.css CHANGED
@@ -927,6 +927,10 @@ body {
927
  max-height: 100vh;
928
  }
929
 
 
 
 
 
930
  .min-h-\[55px\] {
931
  min-height: 55px;
932
  }
@@ -1096,6 +1100,22 @@ body {
1096
  grid-template-columns: repeat(2, minmax(0, 1fr));
1097
  }
1098
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1099
  .flex-col {
1100
  flex-direction: column;
1101
  }
@@ -1112,10 +1132,18 @@ body {
1112
  align-content: flex-start;
1113
  }
1114
 
 
 
 
 
1115
  .items-center {
1116
  align-items: center;
1117
  }
1118
 
 
 
 
 
1119
  .justify-center {
1120
  justify-content: center;
1121
  }
@@ -1136,6 +1164,10 @@ body {
1136
  gap: 0.5rem;
1137
  }
1138
 
 
 
 
 
1139
  .gap-4 {
1140
  gap: 1rem;
1141
  }
@@ -1200,6 +1232,10 @@ body {
1200
  margin-bottom: calc(0.5rem * var(--tw-space-y-reverse));
1201
  }
1202
 
 
 
 
 
1203
  .self-stretch {
1204
  align-self: stretch;
1205
  }
@@ -1252,6 +1288,11 @@ body {
1252
  border-width: 2px;
1253
  }
1254
 
 
 
 
 
 
1255
  .border-b {
1256
  border-bottom-width: 1px;
1257
  }
@@ -1493,6 +1534,10 @@ body {
1493
  padding-top: 1rem;
1494
  }
1495
 
 
 
 
 
1496
  .text-left {
1497
  text-align: left;
1498
  }
@@ -1577,6 +1622,11 @@ body {
1577
  letter-spacing: 0.025em;
1578
  }
1579
 
 
 
 
 
 
1580
  .text-card-foreground {
1581
  color: hsl(var(--card-foreground));
1582
  }
@@ -1993,6 +2043,27 @@ body {
1993
  z-index: 10;
1994
  }
1995
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1996
  :root:has(.data-\[state\=open\]\:no-bg-scroll[data-state="open"]) {
1997
  overflow: hidden;
1998
  }
@@ -2537,6 +2608,11 @@ body {
2537
  --tw-gradient-to: #d1d5db var(--tw-gradient-to-position);
2538
  }
2539
 
 
 
 
 
 
2540
  .dark\:hover\:border-white:hover:where(.dark, .dark *) {
2541
  --tw-border-opacity: 1;
2542
  border-color: rgb(255 255 255 / var(--tw-border-opacity));
@@ -2610,4 +2686,4 @@ body {
2610
 
2611
  .\[\&_tr\]\:border-b tr {
2612
  border-bottom-width: 1px;
2613
- }
 
927
  max-height: 100vh;
928
  }
929
 
930
+ .min-h-0 {
931
+ min-height: 0px;
932
+ }
933
+
934
  .min-h-\[55px\] {
935
  min-height: 55px;
936
  }
 
1100
  grid-template-columns: repeat(2, minmax(0, 1fr));
1101
  }
1102
 
1103
+ .grid-cols-\[1fr\] {
1104
+ grid-template-columns: 1fr;
1105
+ }
1106
+
1107
+ .grid-cols-\[minmax\(0\2c _4fr\)_minmax\(0\2c _1fr\)\] {
1108
+ grid-template-columns: minmax(0, 4fr) minmax(0, 1fr);
1109
+ }
1110
+
1111
+ .grid-rows-\[55px_1fr\] {
1112
+ grid-template-rows: 55px 1fr;
1113
+ }
1114
+
1115
+ .grid-rows-\[auto_1fr_auto\] {
1116
+ grid-template-rows: auto 1fr auto;
1117
+ }
1118
+
1119
  .flex-col {
1120
  flex-direction: column;
1121
  }
 
1132
  align-content: flex-start;
1133
  }
1134
 
1135
+ .items-end {
1136
+ align-items: flex-end;
1137
+ }
1138
+
1139
  .items-center {
1140
  align-items: center;
1141
  }
1142
 
1143
+ .justify-end {
1144
+ justify-content: flex-end;
1145
+ }
1146
+
1147
  .justify-center {
1148
  justify-content: center;
1149
  }
 
1164
  gap: 0.5rem;
1165
  }
1166
 
1167
+ .gap-3 {
1168
+ gap: 0.75rem;
1169
+ }
1170
+
1171
  .gap-4 {
1172
  gap: 1rem;
1173
  }
 
1232
  margin-bottom: calc(0.5rem * var(--tw-space-y-reverse));
1233
  }
1234
 
1235
+ .self-end {
1236
+ align-self: flex-end;
1237
+ }
1238
+
1239
  .self-stretch {
1240
  align-self: stretch;
1241
  }
 
1288
  border-width: 2px;
1289
  }
1290
 
1291
+ .border-x {
1292
+ border-left-width: 1px;
1293
+ border-right-width: 1px;
1294
+ }
1295
+
1296
  .border-b {
1297
  border-bottom-width: 1px;
1298
  }
 
1534
  padding-top: 1rem;
1535
  }
1536
 
1537
+ .pr-3 {
1538
+ padding-right: 0.75rem;
1539
+ }
1540
+
1541
  .text-left {
1542
  text-align: left;
1543
  }
 
1622
  letter-spacing: 0.025em;
1623
  }
1624
 
1625
+ .text-black {
1626
+ --tw-text-opacity: 1;
1627
+ color: rgb(0 0 0 / var(--tw-text-opacity));
1628
+ }
1629
+
1630
  .text-card-foreground {
1631
  color: hsl(var(--card-foreground));
1632
  }
 
2043
  z-index: 10;
2044
  }
2045
 
2046
+ header {
2047
+ grid-column: 1/-1;
2048
+ }
2049
+
2050
+ main {
2051
+ overflow: auto;
2052
+ }
2053
+
2054
+ aside {
2055
+ overflow: auto;
2056
+ }
2057
+
2058
+ .scroll-container {
2059
+ padding-right: 10px;
2060
+ }
2061
+
2062
+ .question-message {
2063
+ background-color: #61D790;
2064
+ color: #2E2F27;
2065
+ }
2066
+
2067
  :root:has(.data-\[state\=open\]\:no-bg-scroll[data-state="open"]) {
2068
  overflow: hidden;
2069
  }
 
2608
  --tw-gradient-to: #d1d5db var(--tw-gradient-to-position);
2609
  }
2610
 
2611
+ .dark\:text-white:where(.dark, .dark *) {
2612
+ --tw-text-opacity: 1;
2613
+ color: rgb(255 255 255 / var(--tw-text-opacity));
2614
+ }
2615
+
2616
  .dark\:hover\:border-white:hover:where(.dark, .dark *) {
2617
  --tw-border-opacity: 1;
2618
  border-color: rgb(255 255 255 / var(--tw-border-opacity));
 
2686
 
2687
  .\[\&_tr\]\:border-b tr {
2688
  border-bottom-width: 1px;
2689
+ }
prepare_feed_deploy.py ADDED
@@ -0,0 +1,977 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %% [markdown]
2
+ # # Visual PDF Retrieval - demo application
3
+ #
4
+ # In this notebook, we will prepare the Vespa backend application for our visual retrieval demo.
5
+ # We will use ColPali as the model to extract patch vectors from images of pdf pages.
6
+ # At query time, we use MaxSim to retrieve and/or (based on the configuration) rank the page results.
7
+ #
8
+ # To see the application in action, visit TODO:
9
+ #
10
+ # The web application is written in FastHTML, meaning the complete application is written in python.
11
+ #
12
+ # The steps we will take in this notebook are:
13
+ #
14
+ # 0. Setup and configuration
15
+ # 1. Download the data
16
+ # 2. Prepare the data
17
+ # 3. Generate queries for evaluation and typeahead search suggestions
18
+ # 4. Deploy the Vespa application
19
+ # 5. Create the Vespa application
20
+ # 6. Feed the data to the Vespa application
21
+ #
22
+ # All the steps that are needed to provision the Vespa application, including feeding the data, can be done from this notebook.
23
+ # We have tried to make it easy for others to run this notebook, to create your own PDF Enterprise Search application using Vespa.
24
+ #
25
+
26
+ # %% [markdown]
27
+ # ## 0. Setup and Configuration
28
+ #
29
+
30
+ # %%
31
+ import os
32
+ import asyncio
33
+ import json
34
+ from typing import Tuple
35
+ import hashlib
36
+ import numpy as np
37
+
38
+ # Vespa
39
+ from vespa.package import (
40
+ ApplicationPackage,
41
+ Field,
42
+ Schema,
43
+ Document,
44
+ HNSW,
45
+ RankProfile,
46
+ Function,
47
+ FieldSet,
48
+ SecondPhaseRanking,
49
+ Summary,
50
+ DocumentSummary,
51
+ )
52
+ from vespa.deployment import VespaCloud
53
+ from vespa.application import Vespa
54
+ from vespa.io import VespaResponse
55
+
56
+ # Google Generative AI
57
+ import google.generativeai as genai
58
+
59
+ # Torch and other ML libraries
60
+ import torch
61
+ from torch.utils.data import DataLoader
62
+ from tqdm import tqdm
63
+ from pdf2image import convert_from_path
64
+ from pypdf import PdfReader
65
+
66
+ # ColPali model and processor
67
+ from colpali_engine.models import ColPali, ColPaliProcessor
68
+ from colpali_engine.utils.torch_utils import get_torch_device
69
+ from vidore_benchmark.utils.image_utils import scale_image, get_base64_image
70
+
71
+ # Other utilities
72
+ from bs4 import BeautifulSoup
73
+ import httpx
74
+ from urllib.parse import urljoin, urlparse
75
+
76
+ # Load environment variables
77
+ from dotenv import load_dotenv
78
+
79
+ load_dotenv()
80
+
81
+ # Avoid warning from huggingface tokenizers
82
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
83
+
84
+ # %% [markdown]
85
+ # ### Create a free trial in Vespa Cloud
86
+ #
87
+ # Create a tenant from [here](https://vespa.ai/free-trial/).
88
+ # The trial includes $300 credit.
89
+ # Take note of your tenant name.
90
+ #
91
+
92
+ # %%
93
+ VESPA_TENANT_NAME = "vespa-team"
94
+
95
+ # %% [markdown]
96
+ # Here, set your desired application name. (Will be created in later steps)
97
+ # Note that you can not have hyphen `-` or underscore `_` in the application name.
98
+ #
99
+
100
+ # %%
101
+ VESPA_APPLICATION_NAME = "colpalidemo2"
102
+ VESPA_SCHEMA_NAME = "pdf_page"
103
+
104
+ # %% [markdown]
105
+ # Next, you need to create some tokens for feeding data, and querying the application.
106
+ # We recommend separate tokens for feeding and querying, (the former with write permission, and the latter with read permission).
107
+ # The tokens can be created from the [Vespa Cloud console](https://console.vespa-cloud.com/) in the 'Account' -> 'Tokens' section.
108
+ #
109
+
110
+ # %%
111
+ VESPA_TOKEN_ID_WRITE = "colpalidemo_write"
112
+ VESPA_TOKEN_ID_READ = "colpalidemo_read"
113
+
114
+ # %% [markdown]
115
+ # We also need to set the value of the write token to be able to feed data to the Vespa application.
116
+ #
117
+
118
+ # %%
119
+ VESPA_CLOUD_SECRET_TOKEN = os.getenv("VESPA_CLOUD_SECRET_TOKEN") or input(
120
+ "Enter Vespa cloud secret token: "
121
+ )
122
+
123
+ # %% [markdown]
124
+ # We will also use the Gemini API to create sample queries for our images.
125
+ # You can also use other VLM's to create these queries.
126
+ # Create a Gemini API key from [here](https://aistudio.google.com/app/apikey).
127
+ #
128
+
129
+ # %%
130
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or input(
131
+ "Enter Google Generative AI API key: "
132
+ )
133
+
134
+ # %%
135
+ MODEL_NAME = "vidore/colpali-v1.2"
136
+
137
+ # Configure Google Generative AI
138
+ genai.configure(api_key=GEMINI_API_KEY)
139
+
140
+ # Set device for Torch
141
+ device = get_torch_device("auto")
142
+ print(f"Using device: {device}")
143
+
144
+ # Load the ColPali model and processor
145
+ model = ColPali.from_pretrained(
146
+ MODEL_NAME,
147
+ torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
148
+ device_map=device,
149
+ ).eval()
150
+
151
+ processor = ColPaliProcessor.from_pretrained(MODEL_NAME)
152
+
153
+ # %% [markdown]
154
+ # ## 1. Download PDFs
155
+ #
156
+ # We are going to use public reports from the Norwegian Government Pension Fund Global (also known as the Oil Fund).
157
+ # The fund puts transparency at the forefront and publishes reports on its investments, holdings, and returns, as well as its strategy and governance.
158
+ #
159
+ # These reports are the ones we are going to use for this showcase.
160
+ # Here are some sample images:
161
+ #
162
+ # ![Sample1](./static/img/gfpg-sample-1.png)
163
+ # ![Sample2](./static/img/gfpg-sample-2.png)
164
+ #
165
+
166
+ # %% [markdown]
167
+ # As we can see, a lot of the information is in the form of tables, charts and numbers.
168
+ # These are not easily extractable using pdf-readers or OCR tools.
169
+ #
170
+
171
+ # %%
172
+ import requests
173
+
174
+ url = "https://www.nbim.no/en/publications/reports/"
175
+ response = requests.get(url)
176
+ response.raise_for_status()
177
+ html_content = response.text
178
+
179
+ # Parse with BeautifulSoup
180
+ soup = BeautifulSoup(html_content, "html.parser")
181
+
182
+ links = []
183
+
184
+ # Find all <a> elements with the specific classes
185
+ for a_tag in soup.find_all("a", href=True):
186
+ classes = a_tag.get("class", [])
187
+ if "button" in classes and "button--download-secondary" in classes:
188
+ href = a_tag["href"]
189
+ full_url = urljoin(url, href)
190
+ links.append(full_url)
191
+
192
+ links
193
+
194
+ # %%
195
+ # Limit the number of PDFs to download
196
+ NUM_PDFS = 2 # Set to None to download all PDFs
197
+ links = links[:NUM_PDFS] if NUM_PDFS else links
198
+ links
199
+
200
+ # %%
201
+ from nest_asyncio import apply
202
+ from typing import List
203
+
204
+ apply()
205
+
206
+ max_attempts = 3
207
+
208
+
209
async def download_pdf(session, url, filename, max_attempts=3):
    """Download a single PDF into the ``pdfs/`` directory, with retries.

    Args:
        session: An ``httpx.AsyncClient`` (or any object with an async ``get``).
        url: URL of the PDF to download.
        filename: Fallback filename; overridden by the server's
            ``Content-Disposition`` header when present.
        max_attempts: Number of attempts before giving up. Defaults to 3.

    Returns:
        The local file path on success, or ``None`` if all attempts fail
        or no usable filename could be derived.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            response = await session.get(url)
            response.raise_for_status()

            # Prefer the server-provided filename from Content-Disposition.
            content_disposition = response.headers.get("Content-Disposition")
            if content_disposition:
                import re

                fname = re.findall('filename="(.+)"', content_disposition)
                if fname:
                    filename = fname[0]

            # Ensure the filename is safe to use on the filesystem.
            safe_filename = filename.replace("/", "_").replace("\\", "_")
            if not safe_filename or safe_filename == "_":
                # A retry cannot fix a bad filename, so bail out immediately.
                # (The original `continue` here never incremented the attempt
                # counter and would loop forever.)
                print(f"Invalid filename derived for {url!r}; skipping")
                return None

            filepath = os.path.join("pdfs", safe_filename)
            with open(filepath, "wb") as f:
                f.write(response.content)
            print(f"Downloaded {safe_filename}")
            return filepath
        except Exception as e:
            print(f"Error downloading {url}: {e}")
            print(f"Retrying ({attempt}/{max_attempts})...")
            await asyncio.sleep(1)  # Brief back-off before the next attempt
    return None
242
+
243
+
244
async def download_pdfs(links: List[str]) -> List[dict]:
    """Download PDFs concurrently from a list of URLs.

    Returns:
        A list of ``{"url": ..., "path": ...}`` dicts, one per link that was
        successfully downloaded.
    """
    async with httpx.AsyncClient() as client:
        kept_links = []
        tasks = []
        for link in links:
            # Derive a filename from the URL path; skip links without one.
            filename = os.path.basename(urlparse(link).path)
            if not filename:
                continue
            kept_links.append(link)
            tasks.append(download_pdf(client, link, filename))

        # Run the downloads concurrently.
        paths = await asyncio.gather(*tasks)

    # BUG FIX: pair against kept_links (not links) so url/path stay aligned
    # even when some links were skipped above — zipping against the full
    # `links` list would silently associate URLs with the wrong files.
    return [
        {"url": link, "path": path} for link, path in zip(kept_links, paths) if path
    ]
265
+
266
+
267
+ # Create the pdfs directory if it doesn't exist
268
+ os.makedirs("pdfs", exist_ok=True)
269
+ # Now run the download_pdfs function with the URL
270
+ pdfs = asyncio.run(download_pdfs(links))
271
+
272
+ # %%
273
+ pdfs
274
+
275
+ # %% [markdown]
276
+ # ## 2. Convert PDFs to Images
277
+ #
278
+
279
+
280
+ # %%
281
def get_pdf_images(pdf_path):
    """Extract per-page text and render every page of a PDF to a PIL image.

    Returns:
        A ``(images, page_texts)`` pair with one entry per page.
    """
    reader = PdfReader(pdf_path)
    page_texts = [page.extract_text() for page in reader.pages]
    images = convert_from_path(pdf_path)
    # Sanity check: exactly one rendered image per extracted text page.
    assert len(images) == len(page_texts)
    return images, page_texts
292
+
293
+
294
+ pdf_folder = "pdfs"
295
+ pdf_pages = []
296
+ for pdf in tqdm(pdfs):
297
+ pdf_file = pdf["path"]
298
+ title = os.path.splitext(os.path.basename(pdf_file))[0]
299
+ images, texts = get_pdf_images(pdf_file)
300
+ for page_no, (image, text) in enumerate(zip(images, texts)):
301
+ pdf_pages.append(
302
+ {
303
+ "title": title,
304
+ "url": pdf["url"],
305
+ "path": pdf_file,
306
+ "image": image,
307
+ "text": text,
308
+ "page_no": page_no,
309
+ }
310
+ )
311
+
312
+ # %%
313
+ len(pdf_pages)
314
+
315
+ # %%
316
+ from collections import Counter
317
+
318
+ # Print the length of the text fields - mean, max and min
319
+ text_lengths = [len(page["text"]) for page in pdf_pages]
320
+ print(f"Mean text length: {np.mean(text_lengths)}")
321
+ print(f"Max text length: {np.max(text_lengths)}")
322
+ print(f"Min text length: {np.min(text_lengths)}")
323
+ print(f"Median text length: {np.median(text_lengths)}")
324
+ print(f"Number of text with length == 0: {Counter(text_lengths)[0]}")
325
+
326
+ # %% [markdown]
327
+ # ## 3. Generate Queries
328
+ #
329
+ # In this step, we want to generate queries for each page image.
330
+ # These will be useful for 2 reasons:
331
+ #
332
+ # 1. We can use these queries as typeahead suggestions in the search bar.
333
+ # 2. We can use the queries to generate an evaluation dataset. See [Improving Retrieval with LLM-as-a-judge](https://blog.vespa.ai/improving-retrieval-with-llm-as-a-judge/) for a deeper dive into this topic.
334
+ #
335
+ # The prompt for generating queries is taken from [this](https://danielvanstrien.xyz/posts/post-with-code/colpali/2024-09-23-generate_colpali_dataset.html#an-update-retrieval-focused-prompt) wonderful blog post by Daniel van Strien.
336
+ #
337
+ # We will use the Gemini API to generate these queries, with `gemini-1.5-flash-8b` as the model.
338
+ #
339
+
340
+ # %%
341
+ from pydantic import BaseModel
342
+
343
+
344
class GeneratedQueries(BaseModel):
    """Structured response schema for the Gemini query-generation prompt.

    Three query types are produced per page image, each as both a natural
    language question and a short keyword search query. Empty strings are
    used when a type (e.g. visual element) does not apply.
    """

    # Main subject of the document.
    broad_topical_question: str
    broad_topical_query: str
    # A specific detail or aspect of the document.
    specific_detail_question: str
    specific_detail_query: str
    # A visual element such as a chart, graph, or image (may be empty).
    visual_element_question: str
    visual_element_query: str
351
+
352
+
353
def get_retrieval_prompt() -> Tuple[str, GeneratedQueries]:
    """Return the query-generation prompt and the pydantic schema of the reply.

    The prompt instructs the VLM (Gemini) to produce three question/query
    pairs (broad topical, specific detail, visual element) for one PDF page
    image, formatted as a JSON object matching :class:`GeneratedQueries`.
    """
    # NOTE(review): `prompt = (prompt) = ...` is a redundant double assignment
    # (equivalent to a single `prompt = ...`); kept unchanged here.
    prompt = (
        prompt
    ) = """You are an investor, stock analyst and financial expert. You will be presented an image of a document page from a report published by the Norwegian Government Pension Fund Global (GPFG). The report may be annual or quarterly reports, or policy reports, on topics such as responsible investment, risk etc.
Your task is to generate retrieval queries and questions that you would use to retrieve this document (or ask based on this document) in a large corpus.
Please generate 3 different types of retrieval queries and questions.
A retrieval query is a keyword based query, made up of 2-5 words, that you would type into a search engine to find this document.
A question is a natural language question that you would ask, for which the document contains the answer.
The queries should be of the following types:
1. A broad topical query: This should cover the main subject of the document.
2. A specific detail query: This should cover a specific detail or aspect of the document.
3. A visual element query: This should cover a visual element of the document, such as a chart, graph, or image.

Important guidelines:
- Ensure the queries are relevant for retrieval tasks, not just describing the page content.
- Use a fact-based natural language style for the questions.
- Frame the queries as if someone is searching for this document in a large corpus.
- Make the queries diverse and representative of different search strategies.

Format your response as a JSON object with the structure of the following example:
{
"broad_topical_question": "What was the Responsible Investment Policy in 2019?",
"broad_topical_query": "responsible investment policy 2019",
"specific_detail_question": "What is the percentage of investments in renewable energy?",
"specific_detail_query": "renewable energy investments percentage",
"visual_element_question": "What is the trend of total holding value over time?",
"visual_element_query": "total holding value trend"
}

If there are no relevant visual elements, provide an empty string for the visual element question and query.
Here is the document image to analyze:
Generate the queries based on this image and provide the response in the specified JSON format.
Only return JSON. Don't return any extra explanation text. """

    return prompt, GeneratedQueries
388
+
389
+
390
+ prompt_text, pydantic_model = get_retrieval_prompt()
391
+
392
+ # %%
393
+ gemini_model = genai.GenerativeModel("gemini-1.5-flash-8b")
394
+
395
+
396
def generate_queries(image, prompt_text, pydantic_model):
    """Ask Gemini for retrieval queries/questions for one page image.

    Returns:
        A dict matching ``pydantic_model``; on any failure, a dict with every
        expected key mapped to an empty string (best-effort, never raises).
    """
    empty_result = {
        "broad_topical_question": "",
        "broad_topical_query": "",
        "specific_detail_question": "",
        "specific_detail_query": "",
        "visual_element_question": "",
        "visual_element_query": "",
    }
    try:
        response = gemini_model.generate_content(
            [image, "\n\n", prompt_text],
            generation_config=genai.GenerationConfig(
                response_mime_type="application/json",
                response_schema=pydantic_model,
            ),
        )
        return json.loads(response.text)
    except Exception:
        # Deliberate best-effort: fall back to empty queries rather than
        # aborting the whole enrichment run on a single bad page/response.
        return empty_result
416
+
417
+
418
+ # %%
419
+ for pdf in tqdm(pdf_pages):
420
+ image = pdf.get("image")
421
+ pdf["queries"] = generate_queries(image, prompt_text, pydantic_model)
422
+
423
+ # %%
424
+ pdf_pages[46]["image"]
425
+
426
+ # %%
427
+ pdf_pages[46]["queries"]
428
+
429
+ # %%
430
+ # Generate queries async - keeping for now as we probably need when applying to the full dataset
431
+ # import asyncio
432
+ # from tenacity import retry, stop_after_attempt, wait_exponential
433
+ # import google.generativeai as genai
434
+ # from tqdm.asyncio import tqdm_asyncio
435
+
436
+ # max_in_flight = 200 # Maximum number of concurrent requests
437
+
438
+
439
+ # async def generate_queries_for_image_async(model, image, semaphore):
440
+ # @retry(stop=stop_after_attempt(3), wait=wait_exponential(), reraise=True)
441
+ # async def _generate():
442
+ # async with semaphore:
443
+ # result = await model.generate_content_async(
444
+ # [image, "\n\n", prompt_text],
445
+ # generation_config=genai.GenerationConfig(
446
+ # response_mime_type="application/json",
447
+ # response_schema=pydantic_model,
448
+ # ),
449
+ # )
450
+ # return json.loads(result.text)
451
+
452
+ # try:
453
+ # return await _generate()
454
+ # except Exception as e:
455
+ # print(f"Error generating queries for image: {e}")
456
+ # return None # Return None or handle as needed
457
+
458
+
459
+ # async def enrich_pdfs():
460
+ # gemini_model = genai.GenerativeModel("gemini-1.5-flash-8b")
461
+ # semaphore = asyncio.Semaphore(max_in_flight)
462
+ # tasks = []
463
+ # for pdf in pdf_pages:
464
+ # pdf["queries"] = []
465
+ # image = pdf.get("image")
466
+ # if image:
467
+ # task = generate_queries_for_image_async(gemini_model, image, semaphore)
468
+ # tasks.append((pdf, task))
469
+
470
+ # # Run the tasks concurrently using asyncio.gather()
471
+ # for pdf, task in tqdm_asyncio(tasks):
472
+ # result = await task
473
+ # if result:
474
+ # pdf["queries"] = result
475
+ # return pdf_pages
476
+
477
+
478
+ # pdf_pages = asyncio.run(enrich_pdfs())
479
+
480
+ # %%
481
# Persist page metadata (title, url, page_no, text, queries) to JSON,
# excluding the non-serializable PIL image objects.
# BUG FIX: create the output directory first — it was only created in a
# later cell, so this write crashed on a fresh checkout (output/ is gitignored).
os.makedirs("output", exist_ok=True)
with open("output/pdf_pages.json", "w") as f:
    to_write = [{k: v for k, v in pdf.items() if k != "image"} for pdf in pdf_pages]
    json.dump(to_write, f, indent=2)
485
+
486
+ # with open("pdfs/pdf_pages.json", "r") as f:
487
+ # saved_pdf_pages = json.load(f)
488
+ # for pdf, saved_pdf in zip(pdf_pages, saved_pdf_pages):
489
+ # pdf.update(saved_pdf)
490
+
491
+ # %% [markdown]
492
+ # ## 4. Generate embeddings
493
+ #
494
+ # Now that we have the queries, we can use the ColPali model to generate embeddings for each page image.
495
+ #
496
+
497
+
498
+ # %%
499
def generate_embeddings(images, model, processor, batch_size=2) -> np.ndarray:
    """
    Generate ColPali embeddings for a list of images.
    Move to CPU only once per batch.

    Args:
        images (List[PIL.Image]): List of PIL images.
        model (nn.Module): The model to generate embeddings.
        processor: The processor to preprocess images.
        batch_size (int, optional): Batch size for processing. Defaults to 2.

    Returns:
        np.ndarray: Embeddings for the images, shape
            (len(images), processor.max_patch_length (1030 for ColPali),
            model.config.hidden_size (patch embedding dimension - 128 for ColPali)).
    """
    embeddings_list = []

    def collate_fn(batch):
        # Batch is a list of images; returns a dict of batched input tensors.
        return processor.process_images(batch)

    dataloader = DataLoader(
        images,
        # BUG FIX: batch_size was never passed, so DataLoader silently fell
        # back to its default of 1 and the parameter had no effect.
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn,
    )

    for batch_doc in tqdm(dataloader, desc="Generating embeddings"):
        with torch.no_grad():
            # Move the batch to the model's device, run the forward pass,
            # then move the result back to CPU once per batch.
            batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
            embeddings_batch = model(**batch_doc)
            embeddings_list.append(torch.unbind(embeddings_batch.to("cpu"), dim=0))
    # Concatenate all per-image embeddings into one numpy array.
    all_embeddings = np.concatenate(embeddings_list, axis=0)
    return all_embeddings
535
+
536
+
537
+ # %%
538
+ # Generate embeddings for all images
539
+ images = [pdf["image"] for pdf in pdf_pages]
540
+ embeddings = generate_embeddings(images, model, processor)
541
+
542
+ # %%
543
+ embeddings.shape
544
+
545
+ # %% [markdown]
546
+ # ## 5. Prepare Data on Vespa Format
547
+ #
548
+ # Now, that we have all the data we need, all that remains is to make sure it is in the right format for Vespa.
549
+ #
550
+
551
+
552
+ # %%
553
def float_to_binary_embedding(float_query_embedding: dict) -> dict:
    """Binarize float patch embeddings for compact storage in Vespa.

    Each vector is thresholded at zero (positive -> 1, otherwise 0) and the
    bits are packed, 8 per value, into signed 8-bit integers.
    """

    def _pack(vector) -> list:
        bits = np.where(np.array(vector) > 0, 1, 0)
        return np.packbits(bits).astype(np.int8).tolist()

    return {key: _pack(vec) for key, vec in float_query_embedding.items()}
562
+
563
+
564
+ # %%
565
+ vespa_feed = []
566
+ for pdf, embedding in zip(pdf_pages, embeddings):
567
+ url = pdf["url"]
568
+ title = pdf["title"]
569
+ image = pdf["image"]
570
+ text = pdf.get("text", "")
571
+ page_no = pdf["page_no"]
572
+ query_dict = pdf["queries"]
573
+ questions = [v for k, v in query_dict.items() if "question" in k and v]
574
+ queries = [v for k, v in query_dict.items() if "query" in k and v]
575
+ base_64_image = get_base64_image(
576
+ scale_image(image, 32), add_url_prefix=False
577
+ ) # Scaled down image to return fast on search (~1kb)
578
+ base_64_full_image = get_base64_image(image, add_url_prefix=False)
579
+ embedding_dict = {k: v for k, v in enumerate(embedding)}
580
+ binary_embedding = float_to_binary_embedding(embedding_dict)
581
+ # id_hash should be md5 hash of url and page_number
582
+ id_hash = hashlib.md5(f"{url}_{page_no}".encode()).hexdigest()
583
+ page = {
584
+ "id": id_hash,
585
+ "fields": {
586
+ "id": id_hash,
587
+ "url": url,
588
+ "title": title,
589
+ "page_number": page_no,
590
+ "blur_image": base_64_image,
591
+ "full_image": base_64_full_image,
592
+ "text": text,
593
+ "embedding": binary_embedding,
594
+ "queries": queries,
595
+ "questions": questions,
596
+ },
597
+ }
598
+ vespa_feed.append(page)
599
+
600
+ # %%
601
+ # We will prepare the Vespa feed data, including the embeddings and the generated queries
602
+
603
+
604
+ # Save vespa_feed to vespa_feed.json
605
+ os.makedirs("output", exist_ok=True)
606
+ with open("output/vespa_feed.json", "w") as f:
607
+ vespa_feed_to_save = []
608
+ for page in vespa_feed:
609
+ document_id = page["id"]
610
+ put_id = f"id:{VESPA_APPLICATION_NAME}:{VESPA_SCHEMA_NAME}::{document_id}"
611
+ vespa_feed_to_save.append({"put": put_id, "fields": page["fields"]})
612
+ json.dump(vespa_feed_to_save, f)
613
+
614
+ # %%
615
+ # import json
616
+
617
+ # with open("output/vespa_feed.json", "r") as f:
618
+ # vespa_feed = json.load(f)
619
+
620
+ # %%
621
+ len(vespa_feed)
622
+
623
+ # %% [markdown]
624
+ # ## 5. Prepare Vespa Application
625
+ #
626
+
627
+ # %%
628
+ # Define the Vespa schema
629
+ colpali_schema = Schema(
630
+ name=VESPA_SCHEMA_NAME,
631
+ document=Document(
632
+ fields=[
633
+ Field(
634
+ name="id",
635
+ type="string",
636
+ indexing=["summary", "index"],
637
+ match=["word"],
638
+ ),
639
+ Field(name="url", type="string", indexing=["summary", "index"]),
640
+ Field(
641
+ name="title",
642
+ type="string",
643
+ indexing=["summary", "index"],
644
+ match=["text"],
645
+ index="enable-bm25",
646
+ ),
647
+ Field(name="page_number", type="int", indexing=["summary", "attribute"]),
648
+ Field(name="blur_image", type="raw", indexing=["summary"]),
649
+ Field(name="full_image", type="raw", indexing=["summary"]),
650
+ Field(
651
+ name="text",
652
+ type="string",
653
+ indexing=["summary", "index"],
654
+ match=["text"],
655
+ index="enable-bm25",
656
+ ),
657
+ Field(
658
+ name="embedding",
659
+ type="tensor<int8>(patch{}, v[16])",
660
+ indexing=[
661
+ "attribute",
662
+ "index",
663
+ ],
664
+ ann=HNSW(
665
+ distance_metric="hamming",
666
+ max_links_per_node=32,
667
+ neighbors_to_explore_at_insert=400,
668
+ ),
669
+ ),
670
+ Field(
671
+ name="questions",
672
+ type="array<string>",
673
+ indexing=["summary", "index", "attribute"],
674
+ index="enable-bm25",
675
+ stemming="best",
676
+ ),
677
+ Field(
678
+ name="queries",
679
+ type="array<string>",
680
+ indexing=["summary", "index", "attribute"],
681
+ index="enable-bm25",
682
+ stemming="best",
683
+ ),
684
+ # Add synthetic fields for the questions and queries
685
+ # Field(
686
+ # name="questions_exact",
687
+ # type="array<string>",
688
+ # indexing=["input questions", "index", "attribute"],
689
+ # match=["word"],
690
+ # is_document_field=False,
691
+ # ),
692
+ # Field(
693
+ # name="queries_exact",
694
+ # type="array<string>",
695
+ # indexing=["input queries", "index"],
696
+ # match=["word"],
697
+ # is_document_field=False,
698
+ # ),
699
+ ]
700
+ ),
701
+ fieldsets=[
702
+ FieldSet(
703
+ name="default",
704
+ fields=["title", "url", "blur_image", "page_number", "text"],
705
+ ),
706
+ FieldSet(
707
+ name="image",
708
+ fields=["full_image"],
709
+ ),
710
+ ],
711
+ document_summaries=[
712
+ DocumentSummary(
713
+ name="default",
714
+ summary_fields=[
715
+ Summary(
716
+ name="text",
717
+ fields=[("bolding", "on")],
718
+ ),
719
+ Summary(
720
+ name="snippet",
721
+ fields=[("source", "text"), "dynamic"],
722
+ ),
723
+ ],
724
+ from_disk=True,
725
+ ),
726
+ ],
727
+ )
728
+
729
+ # Define similarity functions used in all rank profiles
730
+ mapfunctions = [
731
+ Function(
732
+ name="similarities", # computes similarity scores between each query token and image patch
733
+ expression="""
734
+ sum(
735
+ query(qt) * unpack_bits(attribute(embedding)), v
736
+ )
737
+ """,
738
+ ),
739
+ Function(
740
+ name="normalized", # normalizes the similarity scores to [-1, 1]
741
+ expression="""
742
+ (similarities - reduce(similarities, min)) / (reduce((similarities - reduce(similarities, min)), max)) * 2 - 1
743
+ """,
744
+ ),
745
+ Function(
746
+ name="quantized", # quantizes the normalized similarity scores to signed 8-bit integers [-128, 127]
747
+ expression="""
748
+ cell_cast(normalized * 127.999, int8)
749
+ """,
750
+ ),
751
+ ]
752
+
753
+ # Define the 'bm25' rank profile
754
+ colpali_bm25_profile = RankProfile(
755
+ name="bm25",
756
+ inputs=[("query(qt)", "tensor<float>(querytoken{}, v[128])")],
757
+ first_phase="bm25(title) + bm25(text)",
758
+ functions=mapfunctions,
759
+ summary_features=["quantized"],
760
+ )
761
+ colpali_schema.add_rank_profile(colpali_bm25_profile)
762
+
763
+ # Update the 'default' rank profile
764
+ colpali_profile = RankProfile(
765
+ name="default",
766
+ inputs=[("query(qt)", "tensor<float>(querytoken{}, v[128])")],
767
+ first_phase="bm25_score",
768
+ second_phase=SecondPhaseRanking(expression="max_sim", rerank_count=10),
769
+ functions=mapfunctions
770
+ + [
771
+ Function(
772
+ name="max_sim",
773
+ expression="""
774
+ sum(
775
+ reduce(
776
+ sum(
777
+ query(qt) * unpack_bits(attribute(embedding)), v
778
+ ),
779
+ max, patch
780
+ ),
781
+ querytoken
782
+ )
783
+ """,
784
+ ),
785
+ Function(name="bm25_score", expression="bm25(title) + bm25(text)"),
786
+ ],
787
+ summary_features=["quantized"],
788
+ )
789
+ colpali_schema.add_rank_profile(colpali_profile)
790
+
791
+ # Update the 'retrieval-and-rerank' rank profile
792
+ input_query_tensors = []
793
+ MAX_QUERY_TERMS = 64
794
+ for i in range(MAX_QUERY_TERMS):
795
+ input_query_tensors.append((f"query(rq{i})", "tensor<int8>(v[16])"))
796
+
797
+ input_query_tensors.extend(
798
+ [
799
+ ("query(qt)", "tensor<float>(querytoken{}, v[128])"),
800
+ ("query(qtb)", "tensor<int8>(querytoken{}, v[16])"),
801
+ ]
802
+ )
803
+
804
+ colpali_retrieval_profile = RankProfile(
805
+ name="retrieval-and-rerank",
806
+ inputs=input_query_tensors,
807
+ first_phase="max_sim_binary",
808
+ second_phase=SecondPhaseRanking(expression="max_sim", rerank_count=10),
809
+ functions=mapfunctions
810
+ + [
811
+ Function(
812
+ name="max_sim",
813
+ expression="""
814
+ sum(
815
+ reduce(
816
+ sum(
817
+ query(qt) * unpack_bits(attribute(embedding)), v
818
+ ),
819
+ max, patch
820
+ ),
821
+ querytoken
822
+ )
823
+ """,
824
+ ),
825
+ Function(
826
+ name="max_sim_binary",
827
+ expression="""
828
+ sum(
829
+ reduce(
830
+ 1 / (1 + sum(
831
+ hamming(query(qtb), attribute(embedding)), v)
832
+ ),
833
+ max, patch
834
+ ),
835
+ querytoken
836
+ )
837
+ """,
838
+ ),
839
+ ],
840
+ summary_features=["quantized"],
841
+ )
842
+ colpali_schema.add_rank_profile(colpali_retrieval_profile)
843
+
844
+ # %%
845
+ from vespa.configuration.services import (
846
+ services,
847
+ container,
848
+ search,
849
+ document_api,
850
+ document_processing,
851
+ clients,
852
+ client,
853
+ config,
854
+ content,
855
+ redundancy,
856
+ documents,
857
+ node,
858
+ certificate,
859
+ token,
860
+ document,
861
+ nodes,
862
+ )
863
+ from vespa.configuration.vt import vt
864
+ from vespa.package import ServicesConfiguration
865
+
866
+ service_config = ServicesConfiguration(
867
+ application_name=VESPA_APPLICATION_NAME,
868
+ services_config=services(
869
+ container(
870
+ search(),
871
+ document_api(),
872
+ document_processing(),
873
+ clients(
874
+ client(
875
+ certificate(file="security/clients.pem"),
876
+ id="mtls",
877
+ permissions="read,write",
878
+ ),
879
+ client(
880
+ token(id=f"{VESPA_TOKEN_ID_WRITE}"),
881
+ id="token_write",
882
+ permissions="read,write",
883
+ ),
884
+ client(
885
+ token(id=f"{VESPA_TOKEN_ID_READ}"),
886
+ id="token_read",
887
+ permissions="read",
888
+ ),
889
+ ),
890
+ config(
891
+ vt("tag")(
892
+ vt("bold")(
893
+ vt("open", "<strong>"),
894
+ vt("close", "</strong>"),
895
+ ),
896
+ vt("separator", "..."),
897
+ ),
898
+ name="container.qr-searchers",
899
+ ),
900
+ id=f"{VESPA_APPLICATION_NAME}_container",
901
+ version="1.0",
902
+ ),
903
+ content(
904
+ redundancy("1"),
905
+ documents(document(type="pdf_page", mode="index")),
906
+ nodes(node(distribution_key="0", hostalias="node1")),
907
+ config(
908
+ vt("max_matches", "2", replace_underscores=False),
909
+ vt("length", "1000"),
910
+ vt("surround_max", "500", replace_underscores=False),
911
+ vt("min_length", "300", replace_underscores=False),
912
+ name="vespa.config.search.summary.juniperrc",
913
+ ),
914
+ id=f"{VESPA_APPLICATION_NAME}_content",
915
+ version="1.0",
916
+ ),
917
+ version="1.0",
918
+ ),
919
+ )
920
+
921
+ # %%
922
+ # Create the Vespa application package
923
+ vespa_application_package = ApplicationPackage(
924
+ name=VESPA_APPLICATION_NAME,
925
+ schema=[colpali_schema],
926
+ services_config=service_config,
927
+ )
928
+
929
+ # %% [markdown]
930
+ # ## 6. Deploy Vespa Application
931
+ #
932
+
933
+ # %%
934
+ VESPA_TEAM_API_KEY = os.getenv("VESPA_TEAM_API_KEY") or input(
935
+ "Enter Vespa team API key: "
936
+ )
937
+
938
+ # %%
939
+ vespa_cloud = VespaCloud(
940
+ tenant=VESPA_TENANT_NAME,
941
+ application=VESPA_APPLICATION_NAME,
942
+ key_content=VESPA_TEAM_API_KEY,
943
+ application_package=vespa_application_package,
944
+ )
945
+
946
+ # Deploy the application
947
+ vespa_cloud.deploy()
948
+
949
+ # Output the endpoint URL
950
+ endpoint_url = vespa_cloud.get_token_endpoint()
951
+ print(f"Application deployed. Token endpoint URL: {endpoint_url}")
952
+
953
+ # %% [markdown]
954
+ # Make sure to take note of the token endpoint_url.
955
+ # You need to put this in your `.env` file - `VESPA_APP_URL=https://abcd.vespa-app.cloud` - to access the Vespa application from your web application.
956
+ #
957
+
958
+ # %% [markdown]
959
+ # ## 7. Feed Data to Vespa
960
+ #
961
+
962
+ # %%
963
+ # Instantiate Vespa connection using token
964
+ app = Vespa(url=endpoint_url, vespa_cloud_secret_token=VESPA_CLOUD_SECRET_TOKEN)
965
+ app.get_application_status()
966
+
967
+
968
+ # %%
969
def callback(response: VespaResponse, id: str):
    """Per-document feed callback: log any document that failed to feed."""
    if response.is_successful():
        return
    print(
        f"Failed to feed document {id} with status code {response.status_code}: Reason {response.get_json()}"
    )
974
+
975
+
976
+ # Feed data into Vespa asynchronously
977
+ app.feed_async_iterable(vespa_feed, schema=VESPA_SCHEMA_NAME, callback=callback)
pyproject.toml CHANGED
@@ -8,7 +8,7 @@ license = { text = "Apache-2.0" }
8
  dependencies = [
9
  "python-fasthtml",
10
  "huggingface-hub",
11
- "pyvespa@git+https://github.com/vespa-engine/pyvespa",
12
  "vespacli",
13
  "torch",
14
  "vidore-benchmark[interpretability]>=4.0.0,<5.0.0",
@@ -18,6 +18,7 @@ dependencies = [
18
  "setuptools",
19
  "python-dotenv",
20
  "shad4fast>=1.2.1",
 
21
  ]
22
 
23
  # dev-dependencies
@@ -27,3 +28,11 @@ dev = [
27
  "python-dotenv",
28
  "huggingface_hub[cli]"
29
  ]
 
 
 
 
 
 
 
 
 
8
  dependencies = [
9
  "python-fasthtml",
10
  "huggingface-hub",
11
+ "pyvespa>=0.50.0",
12
  "vespacli",
13
  "torch",
14
  "vidore-benchmark[interpretability]>=4.0.0,<5.0.0",
 
18
  "setuptools",
19
  "python-dotenv",
20
  "shad4fast>=1.2.1",
21
+ "google-generativeai>=0.7.2"
22
  ]
23
 
24
  # dev-dependencies
 
28
  "python-dotenv",
29
  "huggingface_hub[cli]"
30
  ]
31
+ feed = [
32
+ "ipykernel",
33
+ "jupytext",
34
+ "pydantic",
35
+ "beautifulsoup4",
36
+ "pdf2image",
37
+ "google-generativeai"
38
+ ]
static/.DS_Store ADDED
Binary file (6.15 kB). View file
 
uv.lock CHANGED
The diff for this file is too large to render. See raw diff