timeki's picture
Add content recommendation (#17)
bcc8503 verified
import re
from collections import defaultdict
from climateqa.utils import get_image_from_azure_blob_storage
from climateqa.engine.chains.prompts import audience_prompts
from PIL import Image
from io import BytesIO
import base64
def make_pairs(lst:list)->list:
    """Group consecutive items of an even-length list into 2-tuples.

    ``[a, b, c, d]`` becomes ``[(a, b), (c, d)]``. An odd-length list raises
    ``IndexError`` because its last item has no partner.
    """
    pairs = []
    for start in range(0, len(lst), 2):
        first, second = lst[start], lst[start + 1]
        pairs.append((first, second))
    return pairs
def serialize_docs(docs:list)->list:
    """Turn document objects into plain dictionaries.

    Every item of ``docs`` must expose ``page_content`` and ``metadata``
    attributes. Each output dict stores them under keys of the same name;
    the metadata object is referenced as-is, not copied.
    """
    return [
        {"page_content": doc.page_content, "metadata": doc.metadata}
        for doc in docs
    ]
def parse_output_llm_with_sources(output:str)->str:
    """Replace ``[Doc X]`` citations in ``output`` with HTML anchor links.

    A citation is a bracketed group such as ``[Doc 1]`` or ``[Doc 1, Doc 2]``.
    Each cited number ``n`` becomes a superscript link pointing at ``#doc{n}``.
    Text outside the brackets is returned untouched, including prose that
    merely happens to start with "Doc" (e.g. "Documents show ...").

    Args:
        output: Raw text that may contain bracketed document citations.

    Returns:
        The text with every citation group replaced by its anchor tags.
    """
    def _render_refs(match):
        # Turn one "Doc 1, Doc 2" group into concatenated anchors.
        numbers = [
            ref.lower().replace("doc","").strip()
            for ref in match.group(1).split(",")
        ]
        return "".join(
            f"""<a href="#doc{number}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{number}</sup></span></a>"""
            for number in numbers
        )

    # Substituting on the match itself (rather than splitting and guessing
    # which pieces are citations from their prefix) guarantees that only
    # bracketed references are rewritten.
    return re.sub(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', _render_refs, output)
def process_figures(docs:list)->tuple:
    """Build the HTML figure panel and the image gallery for image chunks.

    Only documents whose ``metadata["chunk_type"]`` is ``"image"`` are used.
    Figures are de-duplicated by title (figure code plus short name, or the
    short name alone when the figure code is ``"N/A"``). Each kept image is
    fetched with ``get_image_from_azure_blob_storage``, resized to a width of
    500 pixels with the height scaled proportionally, PNG/base64 encoded and
    rendered with ``make_html_figure_sources``.

    Loading is best-effort: if anything fails for one image, the failure is
    printed and that image is skipped.

    Args:
        docs: Objects exposing a ``metadata`` mapping with at least
            ``chunk_type``; image documents also need ``figure_code``,
            ``short_name`` and ``image_path`` (expected to contain
            ``"documents/"``).

    Returns:
        A ``(figures_html, gallery)`` tuple, where ``gallery`` lists the
        fetched, un-resized image objects in document order.
    """
    figures = '<div class="figures-container"><p></p> </div>'
    gallery = []
    seen_titles = set()
    max_image_length = 500

    image_docs = [d for d in docs if d.metadata["chunk_type"] == "image"]
    for i, doc in enumerate(image_docs):
        meta = doc.metadata
        if meta["figure_code"] != "N/A":
            title = f"{meta['figure_code']} - {meta['short_name']}"
        else:
            title = f"{meta['short_name']}"

        # Show each figure once, even when several chunks point to it.
        if title in seen_titles:
            continue
        seen_titles.add(title)

        try:
            # The storage helper expects the path relative to "documents/".
            image_path = meta["image_path"].split("documents/")[1]
            img = get_image_from_azure_blob_storage(image_path)

            # Resize to a fixed width, keeping the aspect ratio, then encode
            # the PNG bytes as base64 so the image can be inlined in the HTML.
            buffered = BytesIO()
            img_resized = img.resize((max_image_length, int(max_image_length * img.size[1]/img.size[0])))
            img_resized.save(buffered, format="PNG")
            img_str = base64.b64encode(buffered.getvalue()).decode()

            figures = figures + make_html_figure_sources(doc, i, img_str)
            gallery.append(img)
        except Exception as e:
            # Deliberately broad: one broken image must not break the panel.
            print(f"Skipped adding image {i} because of {e}")

    return figures, gallery
def generate_html_graphs(graphs:list)->str:
    """Render graphs as a standalone HTML page with one tab per category.

    Each graph is a mapping with ``graph['metadata']['category']`` and
    ``graph['embedding']`` (an HTML snippet inserted verbatim). Categories
    appear in first-seen order and the first tab starts out active.
    """
    # Group the embed snippets under their category.
    embeds_by_category = defaultdict(list)
    for graph in graphs:
        embeds_by_category[graph['metadata']['category']].append(graph['embedding'])

    page_head = '''
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Graphs by Category</title>
<style>
.tab-content {
display: none;
}
.tab-content.active {
display: block;
}
.tabs {
margin-bottom: 20px;
}
.tab-button {
background-color: #ddd;
border: none;
padding: 10px 20px;
cursor: pointer;
margin-right: 5px;
}
.tab-button.active {
background-color: #ccc;
}
</style>
<script>
function showTab(tabId) {
var contents = document.getElementsByClassName('tab-content');
var buttons = document.getElementsByClassName('tab-button');
for (var i = 0; i < contents.length; i++) {
contents[i].classList.remove('active');
buttons[i].classList.remove('active');
}
document.getElementById(tabId).classList.add('active');
document.querySelector('button[data-tab="'+tabId+'"]').classList.add('active');
}
</script>
</head>
<body>
<div class="tabs">
'''
    page_tail = '''
</body>
</html>
'''

    pieces = [page_head]

    # One button per category.
    for idx, category in enumerate(embeds_by_category):
        state = '' if idx else 'active'
        pieces.append(f'<button class="tab-button {state}" onclick="showTab(\'tab-{idx}\')" data-tab="tab-{idx}">{category}</button>')
    pieces.append('</div>')

    # One content pane per category, holding all of its embeds.
    for idx, embeds in enumerate(embeds_by_category.values()):
        state = '' if idx else 'active'
        pieces.append(f'<div id="tab-{idx}" class="tab-content {state}">')
        pieces.extend(embeds)
        pieces.append('</div>')

    pieces.append(page_tail)
    return "".join(pieces)
def make_html_source(source,i):
    """Render one retrieved source chunk as an HTML card.

    Text chunks (``metadata["chunk_type"] == "text"``) produce a "Doc" card
    whose element id is ``doc{i}``; any other chunk type produces an "Image"
    card carrying the AI-generated description. Both show a colour-coded
    relevancy score and a footer link to the source page.

    Args:
        source: Object with ``page_content`` (str) and a ``metadata`` mapping
            providing ``toc_level0``, ``toc_level1``, ``name``, ``short_name``,
            ``reranking_score``, ``chunk_type``, ``page_number``, ``url`` and,
            for non-text chunks, ``figure_code``.
        i: Number displayed in the card title (and used in the id of text
            cards).

    Returns:
        The HTML snippet for the card. Values are inserted verbatim, without
        HTML escaping.
    """
    meta = source.metadata
    content = source.page_content.strip()

    # Table-of-contents breadcrumb: stop at the first level marked "N/A".
    toc_levels = []
    for j in range(2):
        level = meta[f"toc_level{j}"]
        if level == "N/A":
            break
        toc_levels.append(level)
    toc_path = " > ".join(toc_levels)

    if toc_path:
        name = f"<b>{toc_path}</b><br/>{meta['name']}"
    else:
        name = meta['name']

    # Map the reranking score onto the CSS class that colours it.
    score = meta['reranking_score']
    if score > 0.8:
        color = "score-green"
    elif score > 0.5:
        color = "score-orange"
    else:
        color = "score-red"
    relevancy_score = f"<p class=relevancy-score>Relevancy score: <span class='{color}'>{score:.1%}</span></p>"

    if meta["chunk_type"] == "text":
        card = f"""
<div class="card" id="doc{i}">
<div class="card-content">
<h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
<p>{content}</p>
{relevancy_score}
</div>
<div class="card-footer">
<span>{name}</span>
<a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
<span role="img" aria-label="Open PDF">🔗</span>
</a>
</div>
</div>
"""
    else:
        if meta["figure_code"] != "N/A":
            title = f"{meta['figure_code']} - {meta['short_name']}"
        else:
            title = f"{meta['short_name']}"
        card = f"""
<div class="card card-image">
<div class="card-content">
<h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
<p class='ai-generated'>AI-generated description</p>
<p>{content}</p>
{relevancy_score}
</div>
<div class="card-footer">
<span>{name}</span>
<a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
<span role="img" aria-label="Open PDF">🔗</span>
</a>
</div>
</div>
"""
    return card
def make_html_papers(df,i):
    """Render row ``i`` of a papers table as an HTML card.

    Args:
        df: Column-indexable table (presumably a pandas DataFrame; anything
            supporting ``df[column][i]`` works) with the columns ``title``,
            ``abstract``, ``doi`` and ``subtitle``. The ``doi`` value is used
            directly as the link target.
        i: Row key. The card id is ``doc{i}`` while the visible title is
            numbered ``i + 1``.

    Returns:
        The HTML snippet for the card. Values are inserted verbatim, without
        HTML escaping.
    """
    title = df['title'][i]
    content = df['abstract'][i]
    url = df['doi'][i]
    subtitle = df['subtitle'][i]

    card = f"""
<div class="card" id="doc{i}">
<div class="card-content">
<h2>Doc {i+1} - {title}</h2>
<p>{content}</p>
</div>
<div class="card-footer">
<span>{subtitle}</span>
<a href="{url}" target="_blank" class="pdf-link">
<span role="img" aria-label="Open paper">🔗</span>
</a>
</div>
</div>
"""
    return card
def make_html_figure_sources(source,i,img_str):
    """Render an image chunk as an HTML card with the picture inlined.

    Args:
        source: Object with ``page_content`` (str, shown as the AI-generated
            description) and a ``metadata`` mapping providing
            ``reranking_score``, ``name``, ``figure_code``, ``short_name``,
            ``page_number`` and ``url``.
        i: Number displayed in the card title.
        img_str: Base64-encoded PNG data, embedded as a ``data:`` URI.

    Returns:
        The HTML snippet for the card. Values are inserted verbatim, without
        HTML escaping.
    """
    meta = source.metadata
    content = source.page_content.strip()

    # Map the reranking score onto the CSS class that colours it.
    score = meta['reranking_score']
    if score > 0.8:
        color = "score-green"
    elif score > 0.5:
        color = "score-orange"
    else:
        color = "score-red"
    relevancy_score = f"<p class=relevancy-score>Relevancy score: <span class='{color}'>{score:.1%}</span></p>"

    # The footer always shows the plain document name: this card never had a
    # table-of-contents breadcrumb (the old branch for it was unreachable).
    name = meta['name']

    if meta["figure_code"] != "N/A":
        title = f"{meta['figure_code']} - {meta['short_name']}"
    else:
        title = f"{meta['short_name']}"

    card = f"""
<div class="card card-image">
<div class="card-content">
<h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
<img src="data:image/png;base64, { img_str }" alt="Alt text" />
<p class='ai-generated'>AI-generated description</p>
<p>{content}</p>
{relevancy_score}
</div>
<div class="card-footer">
<span>{name}</span>
<a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
<span role="img" aria-label="Open PDF">🔗</span>
</a>
</div>
</div>
"""
    return card
def make_toolbox(tool_name,description = "",checked = False,elem_id = "toggle"):
    """Return the HTML label showing a tool's name and status.

    A check mark is displayed when ``checked`` is true, a loader otherwise.
    ``elem_id`` fills the label's ``for`` attribute. ``description`` is
    accepted but does not appear in the generated markup.
    """
    status_icon = (
        "<span class='checkmark'>&#10003;</span>"
        if checked
        else "<span class='loader'></span>"
    )
    return f"""
<div class="dropdown">
<label for="{elem_id}" class="dropdown-toggle">
{status_icon}
{tool_name}
</label>
</div>
"""