|
|
|
import re |
|
from collections import defaultdict |
|
from climateqa.utils import get_image_from_azure_blob_storage |
|
from climateqa.engine.chains.prompts import audience_prompts |
|
from PIL import Image |
|
from io import BytesIO |
|
import base64 |
|
|
|
|
|
def make_pairs(lst: list) -> list:
    """Group consecutive items of an even-length list into 2-tuples.

    Items at positions (0, 1), (2, 3), ... are paired in order. A list of
    odd length raises ``IndexError`` because its last item has no partner.
    """
    pairs = []
    for start in range(0, len(lst), 2):
        pairs.append((lst[start], lst[start + 1]))
    return pairs
|
|
|
|
|
def serialize_docs(docs: list) -> list:
    """Convert document objects into plain dictionaries.

    Every element of ``docs`` must expose ``page_content`` and ``metadata``
    attributes. The metadata object is stored as-is (it is not copied).

    Returns:
        A list of ``{"page_content": ..., "metadata": ...}`` dicts, in the
        same order as ``docs``.
    """
    return [
        {"page_content": document.page_content, "metadata": document.metadata}
        for document in docs
    ]
|
|
|
|
|
|
|
def parse_output_llm_with_sources(output: str) -> str:
    """Replace ``[Doc N]`` citations in ``output`` with HTML anchor links.

    A citation is a bracketed, comma-separated run such as ``[Doc 1]`` or
    ``[Doc 1, Doc 2]``. Each cited number ``N`` becomes a superscript link
    pointing at ``#docN``. All other text is returned unchanged.

    Args:
        output: Text that may contain citations.

    Returns:
        The text with every citation replaced by its link markup.
    """
    # With exactly one capturing group, re.split alternates its results:
    # even positions hold ordinary text, odd positions hold the captured
    # citation body. Relying on the position (rather than on the text
    # starting with "Doc") keeps sentences such as "Doctors agree ..." intact.
    segments = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)

    rendered = []
    for position, segment in enumerate(segments):
        if position % 2 == 0:
            rendered.append(segment)
            continue
        numbers = [ref.lower().replace("doc", "").strip() for ref in segment.split(",")]
        rendered.append("".join(
            f"""<a href="#doc{number}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{number}</sup></span></a>"""
            for number in numbers
        ))
    return "".join(rendered)
|
|
|
def process_figures(docs: list) -> tuple:
    """Build the figures HTML and the image gallery for the image chunks in ``docs``.

    Only documents whose ``metadata["chunk_type"]`` is ``"image"`` are used.
    Each one gets a title built from ``figure_code`` (unless it is ``"N/A"``)
    and ``short_name``; a title that was already seen is skipped. For every
    remaining document the image is loaded with
    ``get_image_from_azure_blob_storage``, a 500-wide resized PNG copy is
    base64-encoded for the HTML card, and the un-resized image is added to
    the gallery. Any failure while handling one image is printed and that
    image is skipped.

    Returns:
        ``(figures_html, gallery)`` where ``figures_html`` is the container
        markup followed by one card per kept image and ``gallery`` is the
        list of loaded images.
    """
    max_image_length = 500
    html_parts = ['<div class="figures-container"><p></p> </div>']
    gallery = []
    seen_titles = set()

    image_docs = [d for d in docs if d.metadata["chunk_type"] == "image"]
    for index, doc in enumerate(image_docs):
        meta = doc.metadata
        if meta["figure_code"] == "N/A":
            title = f"{meta['short_name']}"
        else:
            title = f"{meta['figure_code']} - {meta['short_name']}"

        # Figures sharing a title are shown only once.
        if title in seen_titles:
            continue
        seen_titles.add(title)

        try:
            # Keep only the part of the stored path that follows "documents/".
            image_path = meta["image_path"].split("documents/")[1]
            img = get_image_from_azure_blob_storage(image_path)

            # assumes img.size is (width, height), as for PIL images — TODO confirm
            size = img.size
            target_size = (max_image_length, int(max_image_length * size[1] / size[0]))
            png_buffer = BytesIO()
            img.resize(target_size).save(png_buffer, format="PNG")
            img_str = base64.b64encode(png_buffer.getvalue()).decode()

            html_parts.append(make_html_figure_sources(doc, index, img_str))
            gallery.append(img)
        except Exception as e:
            print(f"Skipped adding image {index} because of {e}")

    return "".join(html_parts), gallery
|
|
|
|
|
def generate_html_graphs(graphs: list) -> str:
    """Render graph embeds as one HTML page with a tab per category.

    Each item of ``graphs`` is read as ``graph['metadata']['category']`` and
    ``graph['embedding']``. Embeddings are inserted verbatim into the tab of
    their category, in input order. Categories keep first-seen order, and the
    first category's button and tab carry the ``active`` class.

    Returns:
        The complete HTML document as a string.
    """
    grouped = defaultdict(list)
    for graph in graphs:
        grouped[graph['metadata']['category']].append(graph['embedding'])

    page_open = '''
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Graphs by Category</title>
        <style>
            .tab-content {
                display: none;
            }
            .tab-content.active {
                display: block;
            }
            .tabs {
                margin-bottom: 20px;
            }
            .tab-button {
                background-color: #ddd;
                border: none;
                padding: 10px 20px;
                cursor: pointer;
                margin-right: 5px;
            }
            .tab-button.active {
                background-color: #ccc;
            }
        </style>
        <script>
            function showTab(tabId) {
                var contents = document.getElementsByClassName('tab-content');
                var buttons = document.getElementsByClassName('tab-button');
                for (var i = 0; i < contents.length; i++) {
                    contents[i].classList.remove('active');
                    buttons[i].classList.remove('active');
                }
                document.getElementById(tabId).classList.add('active');
                document.querySelector('button[data-tab="'+tabId+'"]').classList.add('active');
            }
        </script>
    </head>
    <body>
        <div class="tabs">
    '''

    page_close = '''
    </body>
    </html>
    '''

    def css_state(position):
        # Only the first tab (and its button) starts out visible.
        return 'active' if position == 0 else ''

    buttons = "".join(
        f'<button class="tab-button {css_state(n)}" onclick="showTab(\'tab-{n}\')" data-tab="tab-{n}">{category}</button>'
        for n, category in enumerate(grouped)
    )
    panels = "".join(
        f'<div id="tab-{n}" class="tab-content {css_state(n)}">' + "".join(embeds) + '</div>'
        for n, embeds in enumerate(grouped.values())
    )

    return page_open + buttons + '</div>' + panels + page_close
|
|
|
|
|
|
|
def make_html_source(source, i):
    """Render one retrieved source as an HTML card.

    ``source`` must expose ``page_content`` and a ``metadata`` mapping with
    the keys ``toc_level0``, ``name``, ``short_name``, ``reranking_score``,
    ``chunk_type``, ``page_number`` and ``url``. ``toc_level1`` is read only
    when ``toc_level0`` is not ``"N/A"``, and ``figure_code`` only for
    non-text chunks.

    Text chunks produce a "Doc {i}" card carrying the anchor id ``doc{i}``.
    Any other chunk type produces an "Image {i}" card without an id.

    Args:
        source: The document to render.
        i: Number shown in the card heading (and used in the anchor id).

    Returns:
        The card markup as a string.
    """
    # NOTE(review): the "π" glyph inside the link spans below looks like a
    # mis-encoded icon character — confirm against the intended file encoding.
    meta = source.metadata
    content = source.page_content.strip()

    # Table-of-contents trail: stop at the first level marked "N/A".
    trail = []
    for level_key in ("toc_level0", "toc_level1"):
        level = meta[level_key]
        if level == "N/A":
            break
        trail.append(level)
    toc_levels = " > ".join(trail)

    if toc_levels:
        name = f"<b>{toc_levels}</b><br/>{meta['name']}"
    else:
        name = meta['name']

    score = meta['reranking_score']
    color = "score-green" if score > 0.8 else "score-orange" if score > 0.5 else "score-red"
    relevancy_score = f"<p class=relevancy-score>Relevancy score: <span class='{color}'>{score:.1%}</span></p>"

    if meta["chunk_type"] == "text":
        heading = f"Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}"
        pdf_href = f"{meta['url']}#page={int(meta['page_number'])}"
        return f"""
    <div class="card" id="doc{i}">
        <div class="card-content">
            <h2>{heading}</h2>
            <p>{content}</p>
            {relevancy_score}
        </div>
        <div class="card-footer">
            <span>{name}</span>
            <a href="{pdf_href}" target="_blank" class="pdf-link">
                <span role="img" aria-label="Open PDF">π</span>
            </a>
        </div>
    </div>
    """

    if meta["figure_code"] == "N/A":
        title = f"{meta['short_name']}"
    else:
        title = f"{meta['figure_code']} - {meta['short_name']}"

    heading = f"Image {i} - {title} - Page {int(meta['page_number'])}"
    pdf_href = f"{meta['url']}#page={int(meta['page_number'])}"
    return f"""
    <div class="card card-image">
        <div class="card-content">
            <h2>{heading}</h2>
            <p class='ai-generated'>AI-generated description</p>
            <p>{content}</p>

            {relevancy_score}
        </div>
        <div class="card-footer">
            <span>{name}</span>
            <a href="{pdf_href}" target="_blank" class="pdf-link">
                <span role="img" aria-label="Open PDF">π</span>
            </a>
        </div>
    </div>
    """
|
|
|
|
|
def make_html_papers(df, i):
    """Render the paper at index ``i`` of ``df`` as an HTML card.

    ``df`` is indexed as ``df[column][i]`` and must provide the columns
    ``title``, ``abstract``, ``doi`` and ``subtitle``. The ``doi`` value is
    used verbatim as the link target. The ``publication_year`` column used to
    be read but never rendered; it is no longer required.

    Args:
        df: Column-indexable table of papers (presumably a pandas DataFrame).
        i: Index of the paper to render.

    Returns:
        The card markup as a string.
    """
    # NOTE(review): the anchor id uses ``i`` while the visible heading uses
    # ``i + 1`` — confirm this offset is intended by whatever links to the id.
    # NOTE(review): the "π" glyph in the link span looks like a mis-encoded
    # icon character — confirm against the intended file encoding.
    title = df['title'][i]
    content = df['abstract'][i]
    url = df['doi'][i]
    subtitle = df['subtitle'][i]

    card = f"""
    <div class="card" id="doc{i}">
        <div class="card-content">
            <h2>Doc {i+1} - {title}</h2>
            <p>{content}</p>
        </div>
        <div class="card-footer">
            <span>{subtitle}</span>
            <a href="{url}" target="_blank" class="pdf-link">
                <span role="img" aria-label="Open paper">π</span>
            </a>
        </div>
    </div>
    """

    return card
|
|
|
|
|
def make_html_figure_sources(source, i, img_str):
    """Render a figure source as an HTML card with the image embedded inline.

    ``source`` must expose ``page_content`` and a ``metadata`` mapping with
    the keys ``reranking_score``, ``name``, ``figure_code``, ``short_name``,
    ``page_number`` and ``url``. The footer always shows ``metadata['name']``:
    the earlier table-of-contents branch could never run, because its list
    was always empty, so it has been removed.

    Args:
        source: The image document to render.
        i: Number shown in the "Image {i}" heading.
        img_str: Base64 text placed in a ``data:image/png`` URI.

    Returns:
        The card markup as a string.
    """
    # NOTE(review): the "π" glyph in the link span looks like a mis-encoded
    # icon character — confirm against the intended file encoding.
    meta = source.metadata
    content = source.page_content.strip()

    score = meta['reranking_score']
    if score > 0.8:
        color = "score-green"
    elif score > 0.5:
        color = "score-orange"
    else:
        color = "score-red"

    name = meta['name']

    relevancy_score = f"<p class=relevancy-score>Relevancy score: <span class='{color}'>{score:.1%}</span></p>"

    if meta["figure_code"] != "N/A":
        title = f"{meta['figure_code']} - {meta['short_name']}"
    else:
        title = f"{meta['short_name']}"

    card = f"""
    <div class="card card-image">
        <div class="card-content">
            <h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
            <img src="data:image/png;base64, {img_str}" alt="Alt text" />
            <p class='ai-generated'>AI-generated description</p>

            <p>{content}</p>

            {relevancy_score}
        </div>
        <div class="card-footer">
            <span>{name}</span>
            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
                <span role="img" aria-label="Open PDF">π</span>
            </a>
        </div>
    </div>
    """
    return card
|
|
|
|
|
|
|
def make_toolbox(tool_name, description="", checked=False, elem_id="toggle"):
    """Return the HTML snippet for a tool's status label.

    Args:
        tool_name: Text shown inside the label.
        description: Accepted but not used by this function.
        checked: When truthy a checkmark span is shown, otherwise a loader span.
        elem_id: Value of the label's ``for`` attribute.

    Returns:
        The dropdown markup as a string.
    """
    span = "<span class='checkmark'>✓</span>" if checked else "<span class='loader'></span>"

    return f"""
    <div class="dropdown">
        <label for="{elem_id}" class="dropdown-toggle">
            {span}
            {tool_name}
        </label>
    </div>
    """
|
|