Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- .gitignore +3 -0
- README.md +1 -1
- app.py +44 -141
- assets/axionable.svg +24 -0
- climateqa/engine/embeddings.py +6 -2
- climateqa/engine/vectorstore.py +77 -76
- requirements.txt +2 -3
- sources/Anticiper-les-effets-de-l-adaptation-dun-rechauffement-climatique-de-plus-4-degres-quels-couts-de-l-adaptation.pdf +3 -0
- style.css +73 -1
- test +3 -6
.gitattributes
CHANGED
@@ -37,3 +37,4 @@ PDF/Anticiper-les-effets-de-l-adaptation-dun-rechauffement-climatique-de-plus-4-
|
|
37 |
PDF/deu-2023.pdf filter=lfs diff=lfs merge=lfs -text
|
38 |
PDF/memo_risques_physiques_focus_batiment_2022.pdf filter=lfs diff=lfs merge=lfs -text
|
39 |
vectors/index.annoy filter=lfs diff=lfs merge=lfs -text
|
|
|
|
37 |
PDF/deu-2023.pdf filter=lfs diff=lfs merge=lfs -text
|
38 |
PDF/memo_risques_physiques_focus_batiment_2022.pdf filter=lfs diff=lfs merge=lfs -text
|
39 |
vectors/index.annoy filter=lfs diff=lfs merge=lfs -text
|
40 |
+
sources/Anticiper-les-effets-de-l-adaptation-dun-rechauffement-climatique-de-plus-4-degres-quels-couts-de-l-adaptation.pdf filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
@@ -9,6 +9,9 @@ setAPIKEY.sh
|
|
9 |
.AppleDouble
|
10 |
.LSOverride
|
11 |
|
|
|
|
|
|
|
12 |
# Icon must end with two \r
|
13 |
Icon
|
14 |
|
|
|
9 |
.AppleDouble
|
10 |
.LSOverride
|
11 |
|
12 |
+
# Historique conversasion with chatbot
|
13 |
+
*.json
|
14 |
+
|
15 |
# Icon must end with two \r
|
16 |
Icon
|
17 |
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
app_file: app.py
|
4 |
sdk: gradio
|
5 |
sdk_version: 4.19.1
|
|
|
1 |
---
|
2 |
+
title: clara
|
3 |
app_file: app.py
|
4 |
sdk: gradio
|
5 |
sdk_version: 4.19.1
|
app.py
CHANGED
@@ -1,8 +1,7 @@
|
|
|
|
|
|
1 |
# , get_pinecone_vectorstore, find_similar_vectors
|
2 |
-
from climateqa.engine.vectorstore import build_vectores_stores
|
3 |
-
from climateqa.engine.rag import make_rag_papers_chain
|
4 |
-
from climateqa.engine.keywords import make_keywords_chain
|
5 |
-
from climateqa.sample_questions import QUESTIONS
|
6 |
from climateqa.engine.text_retriever import ClimateQARetriever
|
7 |
from climateqa.engine.rag import make_rag_chain
|
8 |
from climateqa.engine.llm import get_llm
|
@@ -11,11 +10,9 @@ from datetime import datetime
|
|
11 |
import json
|
12 |
import re
|
13 |
import gradio as gr
|
14 |
-
from climateqa.papers.openalex import OpenAlex
|
15 |
from sentence_transformers import CrossEncoder
|
16 |
|
17 |
reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
|
18 |
-
oa = OpenAlex()
|
19 |
|
20 |
# Load environment variables in local mode
|
21 |
try:
|
@@ -25,9 +22,9 @@ except Exception as e:
|
|
25 |
pass
|
26 |
|
27 |
# Set up Gradio Theme
|
28 |
-
theme = gr.themes.
|
29 |
-
primary_hue="
|
30 |
-
secondary_hue="
|
31 |
font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif",
|
32 |
"system-ui", "sans-serif"],
|
33 |
)
|
@@ -163,7 +160,7 @@ async def chat(query, history):
|
|
163 |
"answer": history[-1][1],
|
164 |
"time": timestamp,
|
165 |
}
|
166 |
-
log_locally(log_file, logs)
|
167 |
|
168 |
yield history, docs_html, output_query, output_language, gallery, output_query, output_keywords
|
169 |
|
@@ -181,7 +178,7 @@ def make_html_source(source, i):
|
|
181 |
<div class="card-content">
|
182 |
<div>
|
183 |
<div style="float:right;width 10%;position:relative;top:0px">
|
184 |
-
<a href='{meta['ax_url']}'><img style="width:20px" src='/file/assets/download.png' /></a>
|
185 |
</div>
|
186 |
<div>
|
187 |
<h2>Extrait {i}</h2>
|
@@ -191,9 +188,9 @@ def make_html_source(source, i):
|
|
191 |
<p>{text_content}</p>
|
192 |
|
193 |
</div>
|
194 |
-
<div class="card-footer">
|
195 |
<span>{name}</span>
|
196 |
-
</div>
|
197 |
</div>
|
198 |
"""
|
199 |
|
@@ -209,79 +206,6 @@ def log_locally(file, logs):
|
|
209 |
f.write(logs_json)
|
210 |
|
211 |
|
212 |
-
def generate_keywords(query):
|
213 |
-
chain = make_keywords_chain(llm)
|
214 |
-
keywords = chain.invoke(query)
|
215 |
-
keywords = " AND ".join(keywords["keywords"])
|
216 |
-
return keywords
|
217 |
-
|
218 |
-
|
219 |
-
papers_cols_widths = {
|
220 |
-
"doc": 50,
|
221 |
-
"id": 100,
|
222 |
-
"title": 300,
|
223 |
-
"doi": 100,
|
224 |
-
"publication_year": 100,
|
225 |
-
"abstract": 500,
|
226 |
-
"rerank_score": 100,
|
227 |
-
"is_oa": 50,
|
228 |
-
}
|
229 |
-
|
230 |
-
papers_cols = list(papers_cols_widths.keys())
|
231 |
-
papers_cols_widths = list(papers_cols_widths.values())
|
232 |
-
|
233 |
-
|
234 |
-
async def find_papers(query, keywords, after):
|
235 |
-
|
236 |
-
summary = ""
|
237 |
-
|
238 |
-
df_works = oa.search(keywords, after=after)
|
239 |
-
df_works = df_works.dropna(subset=["abstract"])
|
240 |
-
df_works = oa.rerank(query, df_works, reranker)
|
241 |
-
df_works = df_works.sort_values("rerank_score", ascending=False)
|
242 |
-
G = oa.make_network(df_works)
|
243 |
-
|
244 |
-
height = "750px"
|
245 |
-
network = oa.show_network(
|
246 |
-
G, color_by="rerank_score", notebook=False, height=height)
|
247 |
-
network_html = network.generate_html()
|
248 |
-
|
249 |
-
network_html = network_html.replace("'", "\"")
|
250 |
-
css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
|
251 |
-
network_html = network_html + css_to_inject
|
252 |
-
|
253 |
-
network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
|
254 |
-
display-capture; encrypted-media;" sandbox="allow-modals allow-forms
|
255 |
-
allow-scripts allow-same-origin allow-popups
|
256 |
-
allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
|
257 |
-
allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
|
258 |
-
|
259 |
-
docs = df_works["content"].head(15).tolist()
|
260 |
-
|
261 |
-
df_works = df_works.reset_index(
|
262 |
-
drop=True).reset_index().rename(columns={"index": "doc"})
|
263 |
-
df_works["doc"] = df_works["doc"] + 1
|
264 |
-
df_works = df_works[papers_cols]
|
265 |
-
|
266 |
-
yield df_works, network_html, summary
|
267 |
-
|
268 |
-
chain = make_rag_papers_chain(llm)
|
269 |
-
result = chain.astream_log(
|
270 |
-
{"question": query, "docs": docs, "language": "English"})
|
271 |
-
path_answer = "/logs/StrOutputParser/streamed_output/-"
|
272 |
-
|
273 |
-
async for op in result:
|
274 |
-
|
275 |
-
op = op.ops[0]
|
276 |
-
|
277 |
-
if op['path'] == path_answer: # reforulated question
|
278 |
-
new_token = op['value'] # str
|
279 |
-
summary += new_token
|
280 |
-
else:
|
281 |
-
continue
|
282 |
-
yield df_works, network_html, summary
|
283 |
-
|
284 |
-
|
285 |
# --------------------------------------------------------------------
|
286 |
# Gradio
|
287 |
# --------------------------------------------------------------------
|
@@ -302,8 +226,13 @@ What would you like to know today?
|
|
302 |
"""
|
303 |
|
304 |
|
305 |
-
with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-component") as demo:
|
306 |
|
|
|
|
|
|
|
|
|
|
|
307 |
with gr.Tab("CLARA"):
|
308 |
|
309 |
with gr.Row(elem_id="chatbot-row"):
|
@@ -315,57 +244,44 @@ with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-compon
|
|
315 |
|
316 |
with gr.Row(elem_id="input-message"):
|
317 |
textbox = gr.Textbox(placeholder="Posez votre question", show_label=False,
|
318 |
-
|
319 |
|
320 |
-
with gr.Column(scale=1, variant="panel", elem_id="right-panel"):
|
321 |
|
322 |
-
|
323 |
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
docs_textbox = gr.State("")
|
328 |
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
|
333 |
-
|
334 |
-
|
|
|
|
|
|
|
335 |
|
336 |
-
with gr.Tab("Papers (beta)", elem_id="tab-papers", elem_classes="max-height other-tabs"):
|
337 |
|
|
|
|
|
338 |
with gr.Row():
|
339 |
with gr.Column(scale=1):
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
placeholder="Keywords", show_label=False, lines=1, interactive=True, elem_id="keywords-papers")
|
344 |
-
after = gr.Slider(minimum=1950, maximum=2023, step=1, value=1960,
|
345 |
-
label="Publication date", show_label=True, interactive=True, elem_id="date-papers")
|
346 |
-
search_papers = gr.Button(
|
347 |
-
"Search", elem_id="search-papers", interactive=True)
|
348 |
-
|
349 |
-
with gr.Column(scale=7):
|
350 |
-
|
351 |
-
with gr.Tab("Summary", elem_id="papers-summary-tab"):
|
352 |
-
papers_summary = gr.Markdown(
|
353 |
-
visible=True, elem_id="papers-summary")
|
354 |
|
355 |
-
with gr.Tab("Relevant papers", elem_id="papers-results-tab"):
|
356 |
-
papers_dataframe = gr.Dataframe(
|
357 |
-
visible=True, elem_id="papers-table", headers=papers_cols)
|
358 |
|
359 |
-
|
360 |
-
|
361 |
-
|
|
|
|
|
|
|
|
|
|
|
362 |
|
363 |
-
with gr.Tab("À propos", elem_classes="max-height other-tabs"):
|
364 |
-
with gr.Row():
|
365 |
-
with gr.Column(scale=1):
|
366 |
-
gr.Markdown(
|
367 |
-
"CLARA (Climate LLM for Adaptation & Risks Answers) by [Axionable](https://www.axionable.com/)"
|
368 |
-
"– Fork de [ClimateQ&A](https://huggingface.co/spaces/Ekimetrics/climate-question-answering/tree/main)")
|
369 |
|
370 |
def start_chat(query, history):
|
371 |
history = history + [(query, None)]
|
@@ -382,21 +298,8 @@ with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-compon
|
|
382 |
)
|
383 |
|
384 |
|
385 |
-
|
386 |
-
def change_sample_questions(key):
|
387 |
-
index = list(QUESTIONS.keys()).index(key)
|
388 |
-
visible_bools = [False] * len(samples)
|
389 |
-
visible_bools[index] = True
|
390 |
-
return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
|
391 |
-
|
392 |
-
# dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
|
393 |
-
|
394 |
-
query_papers.submit(generate_keywords, [query_papers], [keywords_papers])
|
395 |
-
search_papers.click(find_papers, [query_papers, keywords_papers, after], [
|
396 |
-
papers_dataframe, citations_network, papers_summary])
|
397 |
-
|
398 |
demo.queue()
|
399 |
|
400 |
demo.launch(allowed_paths=["assets/download.png",
|
401 |
-
"assets/logo4.png"
|
402 |
-
favicon_path="assets/logo4.png")
|
|
|
1 |
+
|
2 |
+
|
3 |
# , get_pinecone_vectorstore, find_similar_vectors
|
4 |
+
from climateqa.engine.vectorstore import build_vectores_stores, get_PDF_Names_from_GCP
|
|
|
|
|
|
|
5 |
from climateqa.engine.text_retriever import ClimateQARetriever
|
6 |
from climateqa.engine.rag import make_rag_chain
|
7 |
from climateqa.engine.llm import get_llm
|
|
|
10 |
import json
|
11 |
import re
|
12 |
import gradio as gr
|
|
|
13 |
from sentence_transformers import CrossEncoder
|
14 |
|
15 |
reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
|
|
|
16 |
|
17 |
# Load environment variables in local mode
|
18 |
try:
|
|
|
22 |
pass
|
23 |
|
24 |
# Set up Gradio Theme
|
25 |
+
theme = gr.themes.Soft(
|
26 |
+
primary_hue="yellow",
|
27 |
+
secondary_hue="orange",
|
28 |
font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif",
|
29 |
"system-ui", "sans-serif"],
|
30 |
)
|
|
|
160 |
"answer": history[-1][1],
|
161 |
"time": timestamp,
|
162 |
}
|
163 |
+
#log_locally(log_file, logs)
|
164 |
|
165 |
yield history, docs_html, output_query, output_language, gallery, output_query, output_keywords
|
166 |
|
|
|
178 |
<div class="card-content">
|
179 |
<div>
|
180 |
<div style="float:right;width 10%;position:relative;top:0px">
|
181 |
+
<a href='{meta['ax_url']}' target='_blank'><img style="width:20px" src='/file/assets/download.png' /></a>
|
182 |
</div>
|
183 |
<div>
|
184 |
<h2>Extrait {i}</h2>
|
|
|
188 |
<p>{text_content}</p>
|
189 |
|
190 |
</div>
|
191 |
+
<!-- <div class="card-footer">
|
192 |
<span>{name}</span>
|
193 |
+
</div> -->
|
194 |
</div>
|
195 |
"""
|
196 |
|
|
|
206 |
f.write(logs_json)
|
207 |
|
208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
# --------------------------------------------------------------------
|
210 |
# Gradio
|
211 |
# --------------------------------------------------------------------
|
|
|
226 |
"""
|
227 |
|
228 |
|
229 |
+
with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-component", elem_classes="ax_background") as demo:
|
230 |
|
231 |
+
gr.HTML("""
|
232 |
+
<img style="width:100px" src="file/assets/axionable.svg"/>
|
233 |
+
""", elem_classes="logo-axio ")
|
234 |
+
|
235 |
+
# TAB Clara
|
236 |
with gr.Tab("CLARA"):
|
237 |
|
238 |
with gr.Row(elem_id="chatbot-row"):
|
|
|
244 |
|
245 |
with gr.Row(elem_id="input-message"):
|
246 |
textbox = gr.Textbox(placeholder="Posez votre question", show_label=False,
|
247 |
+
scale=7, lines=1, interactive=True, elem_id="input-textbox")
|
248 |
|
|
|
249 |
|
250 |
+
with gr.Column(scale=1, variant="panel", elem_id="right-panel"):
|
251 |
|
252 |
+
with gr.Column(scale=1, elem_id="tab-citations"):
|
253 |
+
|
254 |
+
gr.HTML("<p>Sources</p>")
|
|
|
255 |
|
256 |
+
sources_textbox = gr.HTML(
|
257 |
+
show_label=False, elem_id="sources-textbox")
|
258 |
+
docs_textbox = gr.State("")
|
259 |
|
260 |
+
# l'object tabs est necessaire actuellement
|
261 |
+
# J'ai l'impression qu'il est utiliser pour freezre les contenu des tabs
|
262 |
+
# pendant que l'ia gènère une reponse ..
|
263 |
+
with gr.Tabs() as tabs:
|
264 |
+
None
|
265 |
|
|
|
266 |
|
267 |
+
# TAB A propos
|
268 |
+
with gr.Tab("À propos", elem_classes="max-height other-tabs"):
|
269 |
with gr.Row():
|
270 |
with gr.Column(scale=1):
|
271 |
+
gr.Markdown(
|
272 |
+
("CLARA (Climate LLM for Adaptation & Risks Answers) by [Axionable](https://www.axionable.com/)"
|
273 |
+
"– Fork de [ClimateQ&A](https://huggingface.co/spaces/Ekimetrics/climate-question-answering/tree/main)"), elem_classes="a-propos")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
274 |
|
|
|
|
|
|
|
275 |
|
276 |
+
# # TAB Configuration
|
277 |
+
# with gr.Tab("Configuration"):
|
278 |
+
#
|
279 |
+
# with gr.Row(elem_id="config-row"):
|
280 |
+
# with gr.Column(scale=1):
|
281 |
+
#
|
282 |
+
# for pdfName in get_PDF_Names_from_GCP():
|
283 |
+
# gr.Markdown( pdfName, elem_classes="a-propos")
|
284 |
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
|
286 |
def start_chat(query, history):
|
287 |
history = history + [(query, None)]
|
|
|
298 |
)
|
299 |
|
300 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
301 |
demo.queue()
|
302 |
|
303 |
demo.launch(allowed_paths=["assets/download.png",
|
304 |
+
"assets/logo4.png",
|
305 |
+
"assets/axionable.svg"],favicon_path="assets/logo4.png")
|
assets/axionable.svg
ADDED
|
climateqa/engine/embeddings.py
CHANGED
@@ -8,8 +8,12 @@ def get_embeddings_function(version = "v1.2"):
|
|
8 |
|
9 |
# https://huggingface.co/BAAI/bge-base-en-v1.5
|
10 |
# Best embedding model at a reasonable size at the moment (2023-11-22)
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
13 |
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
|
14 |
print("Loading embeddings model: ", model_name)
|
15 |
embeddings_function = HuggingFaceBgeEmbeddings(
|
|
|
8 |
|
9 |
# https://huggingface.co/BAAI/bge-base-en-v1.5
|
10 |
# Best embedding model at a reasonable size at the moment (2023-11-22)
|
11 |
+
# model_name = "BAAI/bge-base-en-v1.5"
|
12 |
+
|
13 |
+
# https://huggingface.co/BAAI/bge-m3
|
14 |
+
# A better one from 2024-04
|
15 |
+
model_name = "BAAI/bge-m3"
|
16 |
+
|
17 |
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
|
18 |
print("Loading embeddings model: ", model_name)
|
19 |
embeddings_function = HuggingFaceBgeEmbeddings(
|
climateqa/engine/vectorstore.py
CHANGED
@@ -1,98 +1,94 @@
|
|
1 |
-
# Pinecone
|
2 |
-
# More info at https://docs.pinecone.io/docs/langchain
|
3 |
-
# And https://python.langchain.com/docs/integrations/vectorstores/pinecone
|
4 |
-
#import os
|
5 |
-
#from pinecone import Pinecone
|
6 |
-
#from langchain_community.vectorstores import Pinecone as PineconeVectorstore
|
7 |
|
8 |
-
|
9 |
-
#
|
10 |
-
|
11 |
-
|
12 |
-
#except:
|
13 |
-
# pass
|
14 |
|
|
|
15 |
|
16 |
-
|
|
|
|
|
|
|
17 |
|
18 |
-
# # initialize pinecone
|
19 |
-
# pinecone.init(
|
20 |
-
# api_key=os.getenv("PINECONE_API_KEY"), # find at app.pinecone.io
|
21 |
-
# environment=os.getenv("PINECONE_API_ENVIRONMENT"), # next to api key in console
|
22 |
-
# )
|
23 |
|
24 |
-
# index_name = os.getenv("PINECONE_API_INDEX")
|
25 |
-
# vectorstore = Pinecone.from_existing_index(index_name, embeddings,text_key = text_key)
|
26 |
|
27 |
-
|
|
|
28 |
|
29 |
-
# pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
|
30 |
-
# index = pc.Index(os.getenv("PINECONE_API_INDEX"))
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
# )
|
35 |
-
# return vectorstore
|
36 |
|
|
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
-
|
40 |
|
41 |
-
#
|
|
|
|
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
# }
|
47 |
|
48 |
-
#
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
-
|
|
|
55 |
|
56 |
-
|
57 |
-
|
58 |
-
from langchain_text_splitters import CharacterTextSplitter
|
59 |
-
from climateqa.engine.embeddings import get_embeddings_function
|
60 |
-
embeddings_function = get_embeddings_function()
|
61 |
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
|
68 |
-
|
69 |
-
|
70 |
|
71 |
|
72 |
-
|
73 |
-
|
|
|
74 |
|
75 |
def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vectors"):
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
#
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
|
87 |
-
for pdf_page in pdf.pages:
|
88 |
-
f = open(folder_path+"/"+pdf_file+" page "+str(pdf_page.page_number), "w")
|
89 |
-
# f.write(pdf_file+" page "+str(pdf_page.page_number))
|
90 |
-
for char_pdf in pdf_page.chars:
|
91 |
-
f.write(char_pdf["text"])
|
92 |
-
f.close()
|
93 |
|
94 |
docs = []
|
95 |
-
vector_store_from_docs = () # Créer un nouvel objet Annoy ou utiliser celui déjà initialisé selon votre code existant
|
96 |
for filename in os.listdir(folder_path):
|
97 |
if filename.startswith("."):
|
98 |
continue
|
@@ -103,12 +99,17 @@ def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vec
|
|
103 |
|
104 |
for doc in documents:
|
105 |
if (doc.metadata):
|
106 |
-
doc.metadata["ax_page"] = doc.metadata['source'].split("
|
107 |
-
doc.metadata["ax_name"] = doc.metadata['source'].split("
|
108 |
doc.metadata["ax_url"] = "https://storage.googleapis.com/docs-axio-clara/sources/"+doc.metadata["ax_name"]
|
109 |
|
110 |
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
|
111 |
docs += text_splitter.split_documents(documents)
|
112 |
-
vector_store_from_docs = Annoy.from_documents(docs, embeddings_function)
|
113 |
-
|
114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
|
2 |
+
from google.cloud import storage
|
3 |
+
#storage_client = storage.Client()
|
4 |
+
storage_client = storage.Client.create_anonymous_client()
|
5 |
+
bucket_name = "docs-axio-clara"
|
|
|
|
|
6 |
|
7 |
+
from langchain_pinecone import PineconeVectorStore
|
8 |
|
9 |
+
from langchain_community.document_loaders import TextLoader
|
10 |
+
from langchain_text_splitters import CharacterTextSplitter
|
11 |
+
from climateqa.engine.embeddings import get_embeddings_function
|
12 |
+
embeddings_function = get_embeddings_function()
|
13 |
|
|
|
|
|
|
|
|
|
|
|
14 |
|
|
|
|
|
15 |
|
16 |
+
index_name = "my-index"
|
17 |
+
namespace = "my-namespace"
|
18 |
|
|
|
|
|
19 |
|
20 |
+
import os
|
21 |
+
import pdfplumber
|
|
|
|
|
22 |
|
23 |
+
def get_PDF_Names_from_GCP():
|
24 |
|
25 |
+
listName = []
|
26 |
+
# Récupération des fichier depuis GCP storage
|
27 |
+
blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
|
28 |
+
for blob in blobs:
|
29 |
+
listName.append(blob.name)
|
30 |
+
return listName
|
31 |
|
32 |
+
def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
|
33 |
|
34 |
+
# Récupération des fichier depuis GCP storage
|
35 |
+
blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
|
36 |
+
for blob in blobs:
|
37 |
|
38 |
+
print( "\n"+blob.name+":")
|
39 |
+
print( " <- Téléchargement Depuis GCP")
|
40 |
+
blob.download_to_filename(pdf_folder+"/"+blob.name)
|
|
|
41 |
|
42 |
+
# Extraction des textes dpuis les fichiers PDF
|
43 |
+
print(" >>> Extraction PDF")
|
44 |
+
for pdf_file in os.listdir(pdf_folder):
|
45 |
+
if pdf_file.startswith("."):
|
46 |
+
continue
|
47 |
+
print(" > "+pdf_folder+"/"+pdf_file)
|
48 |
+
pdf_total_pages = 0
|
49 |
+
with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
|
50 |
+
pdf_total_pages = len(pdf.pages)
|
51 |
+
|
52 |
+
# Fuite mémoire pour les gros fichiers
|
53 |
+
# Reouvrir le fichier à chaque N page semble rélgler le problème
|
54 |
+
N_page = 300
|
55 |
+
page_number = 0
|
56 |
+
while page_number < pdf_total_pages:
|
57 |
|
58 |
+
print(" -- ouverture du fichier pour "+str(N_page)+ " pages --" )
|
59 |
+
with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
|
60 |
|
61 |
+
npage = 0
|
62 |
+
while (npage < N_page and page_number < pdf_total_pages) :
|
|
|
|
|
|
|
63 |
|
64 |
+
print(" >>> "+str(page_number+1))
|
65 |
+
f = open(folder_path+"/"+pdf_file+"..:page:.."+str(page_number+1), "w")
|
66 |
+
for char_pdf in pdf.pages[page_number].chars:
|
67 |
+
f.write(char_pdf["text"])
|
68 |
+
f.close()
|
69 |
|
70 |
+
npage = npage + 1
|
71 |
+
page_number = page_number + 1
|
72 |
|
73 |
|
74 |
+
print(" X removing: " + blob.name )
|
75 |
+
os.remove(pdf_folder+"/"+blob.name)
|
76 |
+
|
77 |
|
78 |
def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vectors"):
|
79 |
+
|
80 |
+
vectorstore = PineconeVectorStore(
|
81 |
+
index_name=index_name,
|
82 |
+
embedding=embeddings_function,
|
83 |
+
#namespace=namespace
|
84 |
+
)
|
85 |
+
|
86 |
+
return vectorstore
|
87 |
+
|
88 |
+
print(" Vectorisation ...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
docs = []
|
91 |
+
#vector_store_from_docs = () # Créer un nouvel objet Annoy ou utiliser celui déjà initialisé selon votre code existant
|
92 |
for filename in os.listdir(folder_path):
|
93 |
if filename.startswith("."):
|
94 |
continue
|
|
|
99 |
|
100 |
for doc in documents:
|
101 |
if (doc.metadata):
|
102 |
+
doc.metadata["ax_page"] = doc.metadata['source'].split("..:page:..")[-1]
|
103 |
+
doc.metadata["ax_name"] = doc.metadata['source'].split("..:page:..")[0].split("/")[-1]
|
104 |
doc.metadata["ax_url"] = "https://storage.googleapis.com/docs-axio-clara/sources/"+doc.metadata["ax_name"]
|
105 |
|
106 |
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
|
107 |
docs += text_splitter.split_documents(documents)
|
108 |
+
#vector_store_from_docs = Annoy.from_documents(docs, embeddings_function)
|
109 |
+
vectorstore = PineconeVectorStore.from_documents(docs, embeddings_function, index_name=index_name)
|
110 |
+
#vector_store_from_docs.save_local(vectors_path)
|
111 |
+
return vectorstore
|
112 |
+
|
113 |
+
|
114 |
+
print("MISSING VECTORS")
|
115 |
+
exit(0)
|
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
|
|
1 |
gradio==4.19.1
|
2 |
-
gunicorn==22.0.0
|
3 |
python-dotenv==1.0.0
|
4 |
langchain==0.1.10
|
5 |
langchain_openai==0.0.6
|
@@ -10,5 +10,4 @@ msal
|
|
10 |
pyalex==0.13
|
11 |
networkx==3.2.1
|
12 |
pyvis==0.3.2
|
13 |
-
annoy==1.17.3
|
14 |
-
pdfplumber
|
|
|
1 |
+
google-cloud-storage==2.16.0
|
2 |
gradio==4.19.1
|
|
|
3 |
python-dotenv==1.0.0
|
4 |
langchain==0.1.10
|
5 |
langchain_openai==0.0.6
|
|
|
10 |
pyalex==0.13
|
11 |
networkx==3.2.1
|
12 |
pyvis==0.3.2
|
13 |
+
annoy==1.17.3
|
|
sources/Anticiper-les-effets-de-l-adaptation-dun-rechauffement-climatique-de-plus-4-degres-quels-couts-de-l-adaptation.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:be9d2d29a6545fc1949b10eb8428e6fac632aa84020fa61f4f76600817a21cd5
|
3 |
+
size 2079496
|
style.css
CHANGED
@@ -3,6 +3,78 @@
|
|
3 |
--user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
|
4 |
} */
|
5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
.telecharger {
|
7 |
border: 1px solid;
|
8 |
padding: 5px;
|
@@ -43,7 +115,7 @@ body.dark .warning-box * {
|
|
43 |
|
44 |
|
45 |
body.dark .tip-box * {
|
46 |
-
color:
|
47 |
}
|
48 |
|
49 |
|
|
|
3 |
--user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
|
4 |
} */
|
5 |
|
6 |
+
.logo-axio {
|
7 |
+
float: right;
|
8 |
+
position: absolute;
|
9 |
+
right: 0px;
|
10 |
+
}
|
11 |
+
|
12 |
+
|
13 |
+
/* couleur text */
|
14 |
+
p {
|
15 |
+
color: black !important;
|
16 |
+
}
|
17 |
+
li {
|
18 |
+
color: black !important;
|
19 |
+
}
|
20 |
+
|
21 |
+
button.selected {
|
22 |
+
border-radius: 20px !important;
|
23 |
+
}
|
24 |
+
button:hover {
|
25 |
+
color: #ffc000 !important;
|
26 |
+
}
|
27 |
+
|
28 |
+
|
29 |
+
/* fond panels/blocks */
|
30 |
+
.panel {
|
31 |
+
background-color: #eeeeee !important;
|
32 |
+
border: 0px;
|
33 |
+
}
|
34 |
+
.block {
|
35 |
+
background-color: #eeeeee !important;
|
36 |
+
}
|
37 |
+
|
38 |
+
/* fond bot */
|
39 |
+
.bot {
|
40 |
+
background-color: #eeeeee !important;
|
41 |
+
}
|
42 |
+
|
43 |
+
/* avatar en debut de reponse */
|
44 |
+
.avatar-container {
|
45 |
+
align-self: baseline !important;
|
46 |
+
margin-top: 35px;
|
47 |
+
}
|
48 |
+
|
49 |
+
|
50 |
+
|
51 |
+
/* fond user */
|
52 |
+
.user {
|
53 |
+
background-color: #d2d2d2 !important;
|
54 |
+
}
|
55 |
+
textarea {
|
56 |
+
background-color: #d2d2d2 !important;
|
57 |
+
color: black !important;
|
58 |
+
}
|
59 |
+
|
60 |
+
|
61 |
+
/* fond app */
|
62 |
+
gradio-app {
|
63 |
+
background-color: #ffffff !important;
|
64 |
+
}
|
65 |
+
.gradio-container {
|
66 |
+
background-color: #ffffff !important;
|
67 |
+
max-width: 100% !important;
|
68 |
+
width: 100% !important;
|
69 |
+
}
|
70 |
+
|
71 |
+
|
72 |
+
.a-propos {
|
73 |
+
margin: 20px !important;
|
74 |
+
}
|
75 |
+
|
76 |
+
|
77 |
+
|
78 |
.telecharger {
|
79 |
border: 1px solid;
|
80 |
padding: 5px;
|
|
|
115 |
|
116 |
|
117 |
body.dark .tip-box * {
|
118 |
+
color:rgb(216, 216, 216) !important;
|
119 |
}
|
120 |
|
121 |
|
test
CHANGED
@@ -19,8 +19,7 @@ ENV HOME=/home/user \
|
|
19 |
GRADIO_NUM_PORTS=1 \
|
20 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
21 |
GRADIO_THEME=huggingface \
|
22 |
-
SYSTEM=spaces
|
23 |
-
PORT=7860
|
24 |
|
25 |
# Set the working directory to the user's home directory
|
26 |
WORKDIR $HOME/app
|
@@ -28,8 +27,6 @@ WORKDIR $HOME/app
|
|
28 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
29 |
COPY --chown=user . $HOME/app
|
30 |
|
31 |
-
|
32 |
|
33 |
-
|
34 |
-
|
35 |
-
CMD gunicorn -b 0.0.0.0:$PORT app:demo
|
|
|
19 |
GRADIO_NUM_PORTS=1 \
|
20 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
21 |
GRADIO_THEME=huggingface \
|
22 |
+
SYSTEM=spaces
|
|
|
23 |
|
24 |
# Set the working directory to the user's home directory
|
25 |
WORKDIR $HOME/app
|
|
|
27 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
28 |
COPY --chown=user . $HOME/app
|
29 |
|
30 |
+
CMD ["python","setup.py"]
|
31 |
|
32 |
+
CMD ["python", "app.py"]
|
|
|
|