dinhquangson committed on
Commit
4520e07
1 Parent(s): 9f7a757

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -2
app.py CHANGED
@@ -136,14 +136,30 @@ async def create_upload_file(text_field: str, file: UploadFile = File(...), ocr:
136
 
137
 
138
  indexing = Pipeline()
 
 
 
 
 
 
 
 
 
 
 
139
  indexing.add_component("sparse_doc_embedder", FastembedSparseDocumentEmbedder(model="Qdrant/bm42-all-minilm-l6-v2-attentions"))
140
  indexing.add_component("dense_doc_embedder", FastembedDocumentEmbedder(model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"))
141
  indexing.add_component("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE))
 
142
 
 
 
 
 
143
  indexing.connect("sparse_doc_embedder", "dense_doc_embedder")
144
  indexing.connect("dense_doc_embedder", "writer")
145
 
146
- indexing.run({"sparse_doc_embedder": {"documents": documents}})
147
  end_time = time.time()
148
 
149
  elapsed_time = end_time - start_time
@@ -213,7 +229,7 @@ def search(prompt: str):
213
  {"dense_text_embedder": {"text": prompt},
214
  "sparse_text_embedder": {"text": prompt},
215
  "ranker": {"query": prompt},
216
- "prompt_builder": {"documents": documents, "query": prompt},
217
  "llm": {"query": prompt},
218
  }
219
  )
 
136
 
137
 
138
  indexing = Pipeline()
139
+
140
+ document_joiner = DocumentJoiner()
141
+
142
+
143
+ document_cleaner = DocumentCleaner()
144
+
145
+ document_splitter = DocumentSplitter(split_by="word", split_length=1000, split_overlap=0)
146
+
147
+ indexing.add_component("document_joiner", document_joiner)
148
+ indexing.add_component("document_cleaner", document_cleaner)
149
+ indexing.add_component("document_splitter", document_splitter)
150
  indexing.add_component("sparse_doc_embedder", FastembedSparseDocumentEmbedder(model="Qdrant/bm42-all-minilm-l6-v2-attentions"))
151
  indexing.add_component("dense_doc_embedder", FastembedDocumentEmbedder(model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"))
152
  indexing.add_component("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE))
153
+
154
 
155
+ indexing.connect("document_joiner", "document_cleaner")
156
+ indexing.connect("document_cleaner", "document_splitter")
157
+ indexing.connect("document_splitter", "document_embedder")
158
+
159
  indexing.connect("sparse_doc_embedder", "dense_doc_embedder")
160
  indexing.connect("dense_doc_embedder", "writer")
161
 
162
+ indexing.run({"document_joiner": {"documents": documents}})
163
  end_time = time.time()
164
 
165
  elapsed_time = end_time - start_time
 
229
  {"dense_text_embedder": {"text": prompt},
230
  "sparse_text_embedder": {"text": prompt},
231
  "ranker": {"query": prompt},
232
+ "prompt_builder": {"query": prompt},
233
  "llm": {"query": prompt},
234
  }
235
  )