thomasht86 committed on
Commit
5d22e58
•
1 Parent(s): fa270d9

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -126,6 +126,16 @@ python main.py
126
 
127
  ## Deploy to huggingface 🤗
128
 
 
 
 
 
 
 
 
 
 
 
129
  To deploy, run
130
 
131
  ```bash
 
126
 
127
  ## Deploy to huggingface 🤗
128
 
129
+ ### Compiling dependencies
130
+
131
+ If you have changed the dependencies, run this before deploying to compile the dependencies declared in `pyproject.toml` into a pinned `requirements.txt`:
132
+
133
+ ```bash
134
+ uv pip compile pyproject.toml -o requirements.txt
135
+ ```
136
+
137
+ ### Deploying to huggingface
138
+
139
  To deploy, run
140
 
141
  ```bash
prepare_feed_deploy.py CHANGED
@@ -1,16 +1,16 @@
1
  # %% [markdown]
2
  # # Visual PDF Retrieval - demo application
3
- #
4
  # In this notebook, we will prepare the Vespa backend application for our visual retrieval demo.
5
  # We will use ColPali as the model to extract patch vectors from images of pdf pages.
6
  # At query time, we use MaxSim to retrieve and/or (based on the configuration) rank the page results.
7
- #
8
  # To see the application in action, visit TODO:
9
- #
10
  # The web application is written in FastHTML, meaning the complete application is written in python.
11
- #
12
  # The steps we will take in this notebook are:
13
- #
14
  # 0. Setup and configuration
15
  # 1. Download the data
16
  # 2. Prepare the data
@@ -18,14 +18,14 @@
18
  # 4. Deploy the Vespa application
19
  # 5. Create the Vespa application
20
  # 6. Feed the data to the Vespa application
21
- #
22
  # All the steps that are needed to provision the Vespa application, including feeding the data, can be done from this notebook.
23
  # We have tried to make it easy for others to run this notebook, to create your own PDF Enterprise Search application using Vespa.
24
- #
25
 
26
  # %% [markdown]
27
  # ## 0. Setup and Configuration
28
- #
29
 
30
  # %%
31
  import os
@@ -83,11 +83,11 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
83
 
84
  # %% [markdown]
85
  # ### Create a free trial in Vespa Cloud
86
- #
87
  # Create a tenant from [here](https://vespa.ai/free-trial/).
88
  # The trial includes $300 credit.
89
  # Take note of your tenant name.
90
- #
91
 
92
  # %%
93
  VESPA_TENANT_NAME = "vespa-team"
@@ -95,17 +95,17 @@ VESPA_TENANT_NAME = "vespa-team"
95
  # %% [markdown]
96
  # Here, set your desired application name. (Will be created in later steps)
97
  # Note that the application name cannot contain a hyphen `-` or an underscore `_`.
98
- #
99
 
100
  # %%
101
- VESPA_APPLICATION_NAME = "colpalidemo2"
102
  VESPA_SCHEMA_NAME = "pdf_page"
103
 
104
  # %% [markdown]
105
  # Next, you need to create some tokens for feeding data, and querying the application.
106
  # We recommend separate tokens for feeding and querying, (the former with write permission, and the latter with read permission).
107
  # The tokens can be created from the [Vespa Cloud console](https://console.vespa-cloud.com/) in the 'Account' -> 'Tokens' section.
108
- #
109
 
110
  # %%
111
  VESPA_TOKEN_ID_WRITE = "colpalidemo_write"
@@ -113,7 +113,7 @@ VESPA_TOKEN_ID_READ = "colpalidemo_read"
113
 
114
  # %% [markdown]
115
  # We also need to set the value of the write token to be able to feed data to the Vespa application.
116
- #
117
 
118
  # %%
119
  VESPA_CLOUD_SECRET_TOKEN = os.getenv("VESPA_CLOUD_SECRET_TOKEN") or input(
@@ -124,7 +124,7 @@ VESPA_CLOUD_SECRET_TOKEN = os.getenv("VESPA_CLOUD_SECRET_TOKEN") or input(
124
  # We will also use the Gemini API to create sample queries for our images.
125
  # You can also use other VLMs to create these queries.
126
  # Create a Gemini API key from [here](https://aistudio.google.com/app/apikey).
127
- #
128
 
129
  # %%
130
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or input(
@@ -152,21 +152,21 @@ processor = ColPaliProcessor.from_pretrained(MODEL_NAME)
152
 
153
  # %% [markdown]
154
  # ## 1. Download PDFs
155
- #
156
  # We are going to use public reports from the Norwegian Government Pension Fund Global (also known as the Oil Fund).
157
  # The fund puts transparency at the forefront and publishes reports on its investments, holdings, and returns, as well as its strategy and governance.
158
- #
159
  # These reports are the ones we are going to use for this showcase.
160
  # Here are some sample images:
161
- #
162
  # ![Sample1](./static/img/gfpg-sample-1.png)
163
  # ![Sample2](./static/img/gfpg-sample-2.png)
164
- #
165
 
166
  # %% [markdown]
167
  # As we can see, a lot of the information is in the form of tables, charts and numbers.
168
  # These are not easily extractable using pdf-readers or OCR tools.
169
- #
170
 
171
  # %%
172
  import requests
@@ -180,16 +180,20 @@ html_content = response.text
180
  soup = BeautifulSoup(html_content, "html.parser")
181
 
182
  links = []
 
183
 
184
- # Find all <a> elements with the specific classes
185
- for a_tag in soup.find_all("a", href=True):
186
- classes = a_tag.get("class", [])
187
- if "button" in classes and "button--download-secondary" in classes:
 
 
 
188
  href = a_tag["href"]
189
  full_url = urljoin(url, href)
190
  links.append(full_url)
191
-
192
- links
193
 
194
  # %%
195
  # Limit the number of PDFs to download
@@ -274,7 +278,8 @@ pdfs
274
 
275
  # %% [markdown]
276
  # ## 2. Convert PDFs to Images
277
- #
 
278
 
279
  # %%
280
  def get_pdf_images(pdf_path):
@@ -300,6 +305,7 @@ for pdf in tqdm(pdfs):
300
  pdf_pages.append(
301
  {
302
  "title": title,
 
303
  "url": pdf["url"],
304
  "path": pdf_file,
305
  "image": image,
@@ -324,17 +330,17 @@ print(f"Number of text with length == 0: {Counter(text_lengths)[0]}")
324
 
325
  # %% [markdown]
326
  # ## 3. Generate Queries
327
- #
328
  # In this step, we want to generate queries for each page image.
329
  # These will be useful for 2 reasons:
330
- #
331
  # 1. We can use these queries as typeahead suggestions in the search bar.
332
  # 2. We can use the queries to generate an evaluation dataset. See [Improving Retrieval with LLM-as-a-judge](https://blog.vespa.ai/improving-retrieval-with-llm-as-a-judge/) for a deeper dive into this topic.
333
- #
334
  # The prompt for generating queries is taken from [this](https://danielvanstrien.xyz/posts/post-with-code/colpali/2024-09-23-generate_colpali_dataset.html#an-update-retrieval-focused-prompt) wonderful blog post by Daniel van Strien.
335
- #
336
  # We will use the Gemini API to generate these queries, with `gemini-1.5-flash-8b` as the model.
337
- #
338
 
339
  # %%
340
  from pydantic import BaseModel
@@ -413,6 +419,7 @@ def generate_queries(image, prompt_text, pydantic_model):
413
  }
414
  return queries
415
 
 
416
  # %%
417
  for pdf in tqdm(pdf_pages):
418
  image = pdf.get("image")
@@ -488,9 +495,10 @@ with open("output/pdf_pages.json", "w") as f:
488
 
489
  # %% [markdown]
490
  # ## 4. Generate embeddings
491
- #
492
  # Now that we have the queries, we can use the ColPali model to generate embeddings for each page image.
493
- #
 
494
 
495
  # %%
496
  def generate_embeddings(images, model, processor, batch_size=2) -> np.ndarray:
@@ -530,6 +538,7 @@ def generate_embeddings(images, model, processor, batch_size=2) -> np.ndarray:
530
  all_embeddings = np.concatenate(embeddings_list, axis=0)
531
  return all_embeddings
532
 
 
533
  # %%
534
  # Generate embeddings for all images
535
  images = [pdf["image"] for pdf in pdf_pages]
@@ -540,9 +549,10 @@ embeddings.shape
540
 
541
  # %% [markdown]
542
  # ## 5. Prepare Data on Vespa Format
543
- #
544
  # Now that we have all the data we need, all that remains is to make sure it is in the right format for Vespa.
545
- #
 
546
 
547
  # %%
548
  def float_to_binary_embedding(float_query_embedding: dict) -> dict:
@@ -555,10 +565,12 @@ def float_to_binary_embedding(float_query_embedding: dict) -> dict:
555
  binary_query_embeddings[k] = binary_vector
556
  return binary_query_embeddings
557
 
 
558
  # %%
559
  vespa_feed = []
560
  for pdf, embedding in zip(pdf_pages, embeddings):
561
  url = pdf["url"]
 
562
  title = pdf["title"]
563
  image = pdf["image"]
564
  text = pdf.get("text", "")
@@ -580,6 +592,7 @@ for pdf, embedding in zip(pdf_pages, embeddings):
580
  "id": id_hash,
581
  "url": url,
582
  "title": title,
 
583
  "page_number": page_no,
584
  "blur_image": base_64_image,
585
  "full_image": base_64_full_image,
@@ -616,7 +629,7 @@ len(vespa_feed)
616
 
617
  # %% [markdown]
618
  # ## 5. Prepare Vespa Application
619
- #
620
 
621
  # %%
622
  # Define the Vespa schema
@@ -631,6 +644,7 @@ colpali_schema = Schema(
631
  match=["word"],
632
  ),
633
  Field(name="url", type="string", indexing=["summary", "index"]),
 
634
  Field(
635
  name="title",
636
  type="string",
@@ -720,9 +734,7 @@ colpali_schema = Schema(
720
  DocumentSummary(
721
  name="suggestions",
722
  summary_fields=[
723
- Summary(
724
- name="questions"
725
- ),
726
  ],
727
  from_disk=True,
728
  ),
@@ -756,11 +768,12 @@ mapfunctions = [
756
  # Define the 'bm25' rank profile
757
  colpali_bm25_profile = RankProfile(
758
  name="bm25",
759
- inputs=[("query(qt)", "tensor<float>(querytoken{}, v[128])")],
760
  first_phase="bm25(title) + bm25(text)",
761
  functions=mapfunctions,
762
  )
763
 
 
764
  # A function to create an inherited rank profile which also returns quantized similarity scores
765
  def with_quantized_similarity(rank_profile: RankProfile) -> RankProfile:
766
  return RankProfile(
@@ -770,6 +783,7 @@ def with_quantized_similarity(rank_profile: RankProfile) -> RankProfile:
770
  summary_features=["quantized"],
771
  )
772
 
 
773
  colpali_schema.add_rank_profile(colpali_bm25_profile)
774
  colpali_schema.add_rank_profile(with_quantized_similarity(colpali_bm25_profile))
775
 
@@ -941,7 +955,7 @@ vespa_application_package = ApplicationPackage(
941
 
942
  # %% [markdown]
943
  # ## 6. Deploy Vespa Application
944
- #
945
 
946
  # %%
947
  VESPA_TEAM_API_KEY = os.getenv("VESPA_TEAM_API_KEY") or input(
@@ -966,17 +980,18 @@ print(f"Application deployed. Token endpoint URL: {endpoint_url}")
966
  # %% [markdown]
967
  # Make sure to take note of the token endpoint_url.
968
  # You need to put this in your `.env` file - `VESPA_APP_URL=https://abcd.vespa-app.cloud` - to access the Vespa application from your web application.
969
- #
970
 
971
  # %% [markdown]
972
  # ## 8. Feed Data to Vespa
973
- #
974
 
975
  # %%
976
  # Instantiate Vespa connection using token
977
  app = Vespa(url=endpoint_url, vespa_cloud_secret_token=VESPA_CLOUD_SECRET_TOKEN)
978
  app.get_application_status()
979
 
 
980
  # %%
981
  def callback(response: VespaResponse, id: str):
982
  if not response.is_successful():
@@ -987,5 +1002,3 @@ def callback(response: VespaResponse, id: str):
987
 
988
  # Feed data into Vespa asynchronously
989
  app.feed_async_iterable(vespa_feed, schema=VESPA_SCHEMA_NAME, callback=callback)
990
-
991
-
 
1
  # %% [markdown]
2
  # # Visual PDF Retrieval - demo application
3
+ #
4
  # In this notebook, we will prepare the Vespa backend application for our visual retrieval demo.
5
  # We will use ColPali as the model to extract patch vectors from images of pdf pages.
6
  # At query time, we use MaxSim to retrieve and/or (based on the configuration) rank the page results.
7
+ #
8
  # To see the application in action, visit TODO:
9
+ #
10
  # The web application is written in FastHTML, meaning the complete application is written in python.
11
+ #
12
  # The steps we will take in this notebook are:
13
+ #
14
  # 0. Setup and configuration
15
  # 1. Download the data
16
  # 2. Prepare the data
 
18
  # 4. Deploy the Vespa application
19
  # 5. Create the Vespa application
20
  # 6. Feed the data to the Vespa application
21
+ #
22
  # All the steps that are needed to provision the Vespa application, including feeding the data, can be done from this notebook.
23
  # We have tried to make it easy for others to run this notebook, to create your own PDF Enterprise Search application using Vespa.
24
+ #
25
 
26
  # %% [markdown]
27
  # ## 0. Setup and Configuration
28
+ #
29
 
30
  # %%
31
  import os
 
83
 
84
  # %% [markdown]
85
  # ### Create a free trial in Vespa Cloud
86
+ #
87
  # Create a tenant from [here](https://vespa.ai/free-trial/).
88
  # The trial includes $300 credit.
89
  # Take note of your tenant name.
90
+ #
91
 
92
  # %%
93
  VESPA_TENANT_NAME = "vespa-team"
 
95
  # %% [markdown]
96
  # Here, set your desired application name. (Will be created in later steps)
97
  # Note that the application name cannot contain a hyphen `-` or an underscore `_`.
98
+ #
99
 
100
  # %%
101
+ VESPA_APPLICATION_NAME = "colpalidemo"
102
  VESPA_SCHEMA_NAME = "pdf_page"
103
 
104
  # %% [markdown]
105
  # Next, you need to create some tokens for feeding data, and querying the application.
106
  # We recommend separate tokens for feeding and querying, (the former with write permission, and the latter with read permission).
107
  # The tokens can be created from the [Vespa Cloud console](https://console.vespa-cloud.com/) in the 'Account' -> 'Tokens' section.
108
+ #
109
 
110
  # %%
111
  VESPA_TOKEN_ID_WRITE = "colpalidemo_write"
 
113
 
114
  # %% [markdown]
115
  # We also need to set the value of the write token to be able to feed data to the Vespa application.
116
+ #
117
 
118
  # %%
119
  VESPA_CLOUD_SECRET_TOKEN = os.getenv("VESPA_CLOUD_SECRET_TOKEN") or input(
 
124
  # We will also use the Gemini API to create sample queries for our images.
125
  # You can also use other VLMs to create these queries.
126
  # Create a Gemini API key from [here](https://aistudio.google.com/app/apikey).
127
+ #
128
 
129
  # %%
130
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or input(
 
152
 
153
  # %% [markdown]
154
  # ## 1. Download PDFs
155
+ #
156
  # We are going to use public reports from the Norwegian Government Pension Fund Global (also known as the Oil Fund).
157
  # The fund puts transparency at the forefront and publishes reports on its investments, holdings, and returns, as well as its strategy and governance.
158
+ #
159
  # These reports are the ones we are going to use for this showcase.
160
  # Here are some sample images:
161
+ #
162
  # ![Sample1](./static/img/gfpg-sample-1.png)
163
  # ![Sample2](./static/img/gfpg-sample-2.png)
164
+ #
165
 
166
  # %% [markdown]
167
  # As we can see, a lot of the information is in the form of tables, charts and numbers.
168
  # These are not easily extractable using pdf-readers or OCR tools.
169
+ #
170
 
171
  # %%
172
  import requests
 
180
  soup = BeautifulSoup(html_content, "html.parser")
181
 
182
  links = []
183
+ url_to_year = {}
184
 
185
+ # Find all 'div's with id starting with 'year-'
186
+ for year_div in soup.find_all("div", id=lambda x: x and x.startswith("year-")):
187
+ year_id = year_div.get("id", "")
188
+ year = year_id.replace("year-", "")
189
+
190
+ # Within this div, find all 'a' elements with the specific classes
191
+ for a_tag in year_div.select("a.button.button--download-secondary[href]"):
192
  href = a_tag["href"]
193
  full_url = urljoin(url, href)
194
  links.append(full_url)
195
+ url_to_year[full_url] = year
196
+ links, url_to_year
197
 
198
  # %%
199
  # Limit the number of PDFs to download
 
278
 
279
  # %% [markdown]
280
  # ## 2. Convert PDFs to Images
281
+ #
282
+
283
 
284
  # %%
285
  def get_pdf_images(pdf_path):
 
305
  pdf_pages.append(
306
  {
307
  "title": title,
308
+ "year": int(url_to_year[pdf["url"]]),
309
  "url": pdf["url"],
310
  "path": pdf_file,
311
  "image": image,
 
330
 
331
  # %% [markdown]
332
  # ## 3. Generate Queries
333
+ #
334
  # In this step, we want to generate queries for each page image.
335
  # These will be useful for 2 reasons:
336
+ #
337
  # 1. We can use these queries as typeahead suggestions in the search bar.
338
  # 2. We can use the queries to generate an evaluation dataset. See [Improving Retrieval with LLM-as-a-judge](https://blog.vespa.ai/improving-retrieval-with-llm-as-a-judge/) for a deeper dive into this topic.
339
+ #
340
  # The prompt for generating queries is taken from [this](https://danielvanstrien.xyz/posts/post-with-code/colpali/2024-09-23-generate_colpali_dataset.html#an-update-retrieval-focused-prompt) wonderful blog post by Daniel van Strien.
341
+ #
342
  # We will use the Gemini API to generate these queries, with `gemini-1.5-flash-8b` as the model.
343
+ #
344
 
345
  # %%
346
  from pydantic import BaseModel
 
419
  }
420
  return queries
421
 
422
+
423
  # %%
424
  for pdf in tqdm(pdf_pages):
425
  image = pdf.get("image")
 
495
 
496
  # %% [markdown]
497
  # ## 4. Generate embeddings
498
+ #
499
  # Now that we have the queries, we can use the ColPali model to generate embeddings for each page image.
500
+ #
501
+
502
 
503
  # %%
504
  def generate_embeddings(images, model, processor, batch_size=2) -> np.ndarray:
 
538
  all_embeddings = np.concatenate(embeddings_list, axis=0)
539
  return all_embeddings
540
 
541
+
542
  # %%
543
  # Generate embeddings for all images
544
  images = [pdf["image"] for pdf in pdf_pages]
 
549
 
550
  # %% [markdown]
551
  # ## 5. Prepare Data on Vespa Format
552
+ #
553
  # Now that we have all the data we need, all that remains is to make sure it is in the right format for Vespa.
554
+ #
555
+
556
 
557
  # %%
558
  def float_to_binary_embedding(float_query_embedding: dict) -> dict:
 
565
  binary_query_embeddings[k] = binary_vector
566
  return binary_query_embeddings
567
 
568
+
569
  # %%
570
  vespa_feed = []
571
  for pdf, embedding in zip(pdf_pages, embeddings):
572
  url = pdf["url"]
573
+ year = pdf["year"]
574
  title = pdf["title"]
575
  image = pdf["image"]
576
  text = pdf.get("text", "")
 
592
  "id": id_hash,
593
  "url": url,
594
  "title": title,
595
+ "year": year,
596
  "page_number": page_no,
597
  "blur_image": base_64_image,
598
  "full_image": base_64_full_image,
 
629
 
630
  # %% [markdown]
631
  # ## 5. Prepare Vespa Application
632
+ #
633
 
634
  # %%
635
  # Define the Vespa schema
 
644
  match=["word"],
645
  ),
646
  Field(name="url", type="string", indexing=["summary", "index"]),
647
+ Field(name="year", type="int", indexing=["summary", "attribute"]),
648
  Field(
649
  name="title",
650
  type="string",
 
734
  DocumentSummary(
735
  name="suggestions",
736
  summary_fields=[
737
+ Summary(name="questions"),
 
 
738
  ],
739
  from_disk=True,
740
  ),
 
768
  # Define the 'bm25' rank profile
769
  colpali_bm25_profile = RankProfile(
770
  name="bm25",
771
+ inputs=[("query(qt)", "tensor<float>(querytoken{}, v[128])")],
772
  first_phase="bm25(title) + bm25(text)",
773
  functions=mapfunctions,
774
  )
775
 
776
+
777
  # A function to create an inherited rank profile which also returns quantized similarity scores
778
  def with_quantized_similarity(rank_profile: RankProfile) -> RankProfile:
779
  return RankProfile(
 
783
  summary_features=["quantized"],
784
  )
785
 
786
+
787
  colpali_schema.add_rank_profile(colpali_bm25_profile)
788
  colpali_schema.add_rank_profile(with_quantized_similarity(colpali_bm25_profile))
789
 
 
955
 
956
  # %% [markdown]
957
  # ## 6. Deploy Vespa Application
958
+ #
959
 
960
  # %%
961
  VESPA_TEAM_API_KEY = os.getenv("VESPA_TEAM_API_KEY") or input(
 
980
  # %% [markdown]
981
  # Make sure to take note of the token endpoint_url.
982
  # You need to put this in your `.env` file - `VESPA_APP_URL=https://abcd.vespa-app.cloud` - to access the Vespa application from your web application.
983
+ #
984
 
985
  # %% [markdown]
986
  # ## 8. Feed Data to Vespa
987
+ #
988
 
989
  # %%
990
  # Instantiate Vespa connection using token
991
  app = Vespa(url=endpoint_url, vespa_cloud_secret_token=VESPA_CLOUD_SECRET_TOKEN)
992
  app.get_application_status()
993
 
994
+
995
  # %%
996
  def callback(response: VespaResponse, id: str):
997
  if not response.is_successful():
 
1002
 
1003
  # Feed data into Vespa asynchronously
1004
  app.feed_async_iterable(vespa_feed, schema=VESPA_SCHEMA_NAME, callback=callback)
 
 
requirements.txt CHANGED
@@ -24,8 +24,15 @@ attrs==24.2.0
24
  # via aiohttp
25
  beautifulsoup4==4.12.3
26
  # via python-fasthtml
 
 
27
  cachetools==5.5.0
28
  # via google-auth
 
 
 
 
 
29
  certifi==2024.8.30
30
  # via
31
  # httpcore
@@ -39,16 +46,27 @@ click==8.1.7
39
  # via
40
  # typer
41
  # uvicorn
 
 
42
  colpali-engine==0.3.1
43
  # via
44
  # visual-retrieval-colpali (pyproject.toml)
45
  # vidore-benchmark
 
 
 
 
46
  contourpy==1.3.0
47
  # via matplotlib
48
  cryptography==43.0.1
49
  # via pyvespa
50
  cycler==0.12.1
51
  # via matplotlib
 
 
 
 
 
52
  datasets==2.21.0
53
  # via
54
  # mteb
@@ -168,11 +186,16 @@ itsdangerous==2.2.0
168
  jinja2==3.1.4
169
  # via
170
  # pyvespa
 
171
  # torch
172
  joblib==1.4.2
173
  # via scikit-learn
174
  kiwisolver==1.4.7
175
  # via matplotlib
 
 
 
 
176
  loguru==0.7.2
177
  # via vidore-benchmark
178
  lucide-fasthtml==0.0.9
@@ -181,6 +204,8 @@ lxml==5.3.0
181
  # via
182
  # lucide-fasthtml
183
  # pyvespa
 
 
184
  markdown-it-py==3.0.0
185
  # via rich
186
  markupsafe==2.1.5
@@ -201,11 +226,17 @@ multidict==6.1.0
201
  # yarl
202
  multiprocess==0.70.16
203
  # via datasets
 
 
 
 
 
204
  networkx==3.3
205
  # via torch
206
  numpy==1.26.4
207
  # via
208
  # accelerate
 
209
  # colpali-engine
210
  # contourpy
211
  # datasets
@@ -217,6 +248,8 @@ numpy==1.26.4
217
  # scikit-learn
218
  # scipy
219
  # seaborn
 
 
220
  # transformers
221
  # vidore-benchmark
222
  oauthlib==3.2.2
@@ -229,7 +262,10 @@ packaging==24.1
229
  # huggingface-hub
230
  # matplotlib
231
  # peft
 
 
232
  # transformers
 
233
  pandas==2.2.3
234
  # via
235
  # datasets
@@ -247,8 +283,14 @@ pillow==10.4.0
247
  # pdf2image
248
  # sentence-transformers
249
  # vidore-benchmark
 
 
250
  polars==1.9.0
251
  # via mteb
 
 
 
 
252
  proto-plus==1.24.0
253
  # via
254
  # google-ai-generativelanguage
@@ -277,8 +319,12 @@ pycparser==2.22
277
  # via cffi
278
  pydantic==2.9.2
279
  # via
 
280
  # google-generativeai
281
  # mteb
 
 
 
282
  pydantic-core==2.23.4
283
  # via pydantic
284
  pygments==2.18.0
@@ -334,7 +380,9 @@ requests==2.32.3
334
  # mteb
335
  # pyvespa
336
  # requests-toolbelt
 
337
  # transformers
 
338
  requests-toolbelt==1.0.0
339
  # via pyvespa
340
  rich==13.9.2
@@ -366,27 +414,47 @@ sentence-transformers==3.1.1
366
  sentencepiece==0.2.0
367
  # via vidore-benchmark
368
  setuptools==75.1.0
369
- # via visual-retrieval-colpali (pyproject.toml)
 
 
 
 
370
  shad4fast==1.2.1
371
  # via visual-retrieval-colpali (pyproject.toml)
372
  shellingham==1.5.4
373
  # via typer
374
  six==1.16.0
375
  # via python-dateutil
 
 
376
  sniffio==1.3.1
377
  # via
378
  # anyio
379
  # httpx
380
  soupsieve==2.6
381
  # via beautifulsoup4
 
 
 
 
 
 
382
  sqlite-minutils==3.37.0.post3
383
  # via fastlite
 
 
 
 
 
 
384
  starlette==0.39.2
385
  # via python-fasthtml
386
  sympy==1.13.3
387
  # via torch
388
  tenacity==9.0.0
389
  # via pyvespa
 
 
390
  threadpoolctl==3.5.0
391
  # via scikit-learn
392
  tokenizers==0.20.0
@@ -408,6 +476,7 @@ tqdm==4.66.5
408
  # mteb
409
  # peft
410
  # sentence-transformers
 
411
  # transformers
412
  transformers==4.45.1
413
  # via
@@ -416,10 +485,14 @@ transformers==4.45.1
416
  # sentence-transformers
417
  # vidore-benchmark
418
  typer==0.12.5
419
- # via vidore-benchmark
 
 
 
420
  typing-extensions==4.12.2
421
  # via
422
  # anyio
 
423
  # google-generativeai
424
  # huggingface-hub
425
  # mteb
@@ -448,10 +521,19 @@ vespacli==8.391.23
448
  # via visual-retrieval-colpali (pyproject.toml)
449
  vidore-benchmark==4.0.0
450
  # via visual-retrieval-colpali (pyproject.toml)
 
 
 
 
 
451
  watchfiles==0.24.0
452
  # via uvicorn
 
 
453
  websockets==13.1
454
  # via uvicorn
 
 
455
  xxhash==3.5.0
456
  # via datasets
457
  yarl==1.13.1
 
24
  # via aiohttp
25
  beautifulsoup4==4.12.3
26
  # via python-fasthtml
27
+ blis==0.7.11
28
+ # via thinc
29
  cachetools==5.5.0
30
  # via google-auth
31
+ catalogue==2.0.10
32
+ # via
33
+ # spacy
34
+ # srsly
35
+ # thinc
36
  certifi==2024.8.30
37
  # via
38
  # httpcore
 
46
  # via
47
  # typer
48
  # uvicorn
49
+ cloudpathlib==0.20.0
50
+ # via weasel
51
  colpali-engine==0.3.1
52
  # via
53
  # visual-retrieval-colpali (pyproject.toml)
54
  # vidore-benchmark
55
+ confection==0.1.5
56
+ # via
57
+ # thinc
58
+ # weasel
59
  contourpy==1.3.0
60
  # via matplotlib
61
  cryptography==43.0.1
62
  # via pyvespa
63
  cycler==0.12.1
64
  # via matplotlib
65
+ cymem==2.0.8
66
+ # via
67
+ # preshed
68
+ # spacy
69
+ # thinc
70
  datasets==2.21.0
71
  # via
72
  # mteb
 
186
  jinja2==3.1.4
187
  # via
188
  # pyvespa
189
+ # spacy
190
  # torch
191
  joblib==1.4.2
192
  # via scikit-learn
193
  kiwisolver==1.4.7
194
  # via matplotlib
195
+ langcodes==3.4.1
196
+ # via spacy
197
+ language-data==1.2.0
198
+ # via langcodes
199
  loguru==0.7.2
200
  # via vidore-benchmark
201
  lucide-fasthtml==0.0.9
 
204
  # via
205
  # lucide-fasthtml
206
  # pyvespa
207
+ marisa-trie==1.2.1
208
+ # via language-data
209
  markdown-it-py==3.0.0
210
  # via rich
211
  markupsafe==2.1.5
 
226
  # yarl
227
  multiprocess==0.70.16
228
  # via datasets
229
+ murmurhash==1.0.10
230
+ # via
231
+ # preshed
232
+ # spacy
233
+ # thinc
234
  networkx==3.3
235
  # via torch
236
  numpy==1.26.4
237
  # via
238
  # accelerate
239
+ # blis
240
  # colpali-engine
241
  # contourpy
242
  # datasets
 
248
  # scikit-learn
249
  # scipy
250
  # seaborn
251
+ # spacy
252
+ # thinc
253
  # transformers
254
  # vidore-benchmark
255
  oauthlib==3.2.2
 
262
  # huggingface-hub
263
  # matplotlib
264
  # peft
265
+ # spacy
266
+ # thinc
267
  # transformers
268
+ # weasel
269
  pandas==2.2.3
270
  # via
271
  # datasets
 
283
  # pdf2image
284
  # sentence-transformers
285
  # vidore-benchmark
286
+ pip==24.3.1
287
+ # via visual-retrieval-colpali (pyproject.toml)
288
  polars==1.9.0
289
  # via mteb
290
+ preshed==3.0.9
291
+ # via
292
+ # spacy
293
+ # thinc
294
  proto-plus==1.24.0
295
  # via
296
  # google-ai-generativelanguage
 
319
  # via cffi
320
  pydantic==2.9.2
321
  # via
322
+ # confection
323
  # google-generativeai
324
  # mteb
325
+ # spacy
326
+ # thinc
327
+ # weasel
328
  pydantic-core==2.23.4
329
  # via pydantic
330
  pygments==2.18.0
 
380
  # mteb
381
  # pyvespa
382
  # requests-toolbelt
383
+ # spacy
384
  # transformers
385
+ # weasel
386
  requests-toolbelt==1.0.0
387
  # via pyvespa
388
  rich==13.9.2
 
414
  sentencepiece==0.2.0
415
  # via vidore-benchmark
416
  setuptools==75.1.0
417
+ # via
418
+ # visual-retrieval-colpali (pyproject.toml)
419
+ # marisa-trie
420
+ # spacy
421
+ # thinc
422
  shad4fast==1.2.1
423
  # via visual-retrieval-colpali (pyproject.toml)
424
  shellingham==1.5.4
425
  # via typer
426
  six==1.16.0
427
  # via python-dateutil
428
+ smart-open==7.0.5
429
+ # via weasel
430
  sniffio==1.3.1
431
  # via
432
  # anyio
433
  # httpx
434
  soupsieve==2.6
435
  # via beautifulsoup4
436
+ spacy==3.7.5
437
+ # via visual-retrieval-colpali (pyproject.toml)
438
+ spacy-legacy==3.0.12
439
+ # via spacy
440
+ spacy-loggers==1.0.5
441
+ # via spacy
442
  sqlite-minutils==3.37.0.post3
443
  # via fastlite
444
+ srsly==2.4.8
445
+ # via
446
+ # confection
447
+ # spacy
448
+ # thinc
449
+ # weasel
450
  starlette==0.39.2
451
  # via python-fasthtml
452
  sympy==1.13.3
453
  # via torch
454
  tenacity==9.0.0
455
  # via pyvespa
456
+ thinc==8.2.5
457
+ # via spacy
458
  threadpoolctl==3.5.0
459
  # via scikit-learn
460
  tokenizers==0.20.0
 
476
  # mteb
477
  # peft
478
  # sentence-transformers
479
+ # spacy
480
  # transformers
481
  transformers==4.45.1
482
  # via
 
485
  # sentence-transformers
486
  # vidore-benchmark
487
  typer==0.12.5
488
+ # via
489
+ # spacy
490
+ # vidore-benchmark
491
+ # weasel
492
  typing-extensions==4.12.2
493
  # via
494
  # anyio
495
+ # cloudpathlib
496
  # google-generativeai
497
  # huggingface-hub
498
  # mteb
 
521
  # via visual-retrieval-colpali (pyproject.toml)
522
  vidore-benchmark==4.0.0
523
  # via visual-retrieval-colpali (pyproject.toml)
524
+ wasabi==1.1.3
525
+ # via
526
+ # spacy
527
+ # thinc
528
+ # weasel
529
  watchfiles==0.24.0
530
  # via uvicorn
531
+ weasel==0.4.1
532
+ # via spacy
533
  websockets==13.1
534
  # via uvicorn
535
+ wrapt==1.16.0
536
+ # via smart-open
537
  xxhash==3.5.0
538
  # via datasets
539
  yarl==1.13.1
vespa_feed_to_hf_dataset.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from dotenv import load_dotenv
3
+ import os
4
+ import base64
5
+ from PIL import Image
6
+ import io
7
+ from datasets import Dataset, Image as HFImage
8
+ from pathlib import Path
9
+ from tqdm import tqdm
10
+
11
+ load_dotenv()
12
+
13
+ df = pd.read_json("output/vespa_feed_full.jsonl", lines=True)
14
+ df = pd.json_normalize(df["fields"].tolist())
15
+
16
+ dataset_dir = Path("hf_dataset")
17
+ image_dir = dataset_dir / "images"
18
+ os.makedirs(image_dir, exist_ok=True)
19
+
20
+
21
+ def save_image(image_data, filename):
22
+ img_data = base64.b64decode(image_data)
23
+ img = Image.open(io.BytesIO(img_data))
24
+ img.save(filename)
25
+
26
+
27
+ for idx, row in tqdm(df.iterrows()):
28
+ blur_filename = os.path.join(image_dir, f"blur_{idx}.jpg")
29
+ full_filename = os.path.join(image_dir, f"full_{idx}.jpg")
30
+ save_image(row["blur_image"], blur_filename)
31
+ save_image(row["full_image"], full_filename)
32
+ df.at[idx, "blur_image"] = blur_filename
33
+ df.at[idx, "full_image"] = full_filename
34
+
35
+
36
+ # Step 3: Convert to Hugging Face Dataset
37
+ dataset = (
38
+ Dataset.from_dict(df.to_dict(orient="list"))
39
+ .cast_column("blur_image", HFImage())
40
+ .cast_column("full_image", HFImage())
41
+ )
42
+ dataset.push_to_hub("vespa-engine/gpfg-QA", private=True)