import re
from collections import defaultdict
from climateqa.utils import get_image_from_azure_blob_storage
from climateqa.engine.chains.prompts import audience_prompts
from PIL import Image
from io import BytesIO
import base64
def make_pairs(lst):
"""from a list of even lenght, make tupple pairs"""
return [(lst[i], lst[i + 1]) for i in range(0, len(lst), 2)]
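
# Example: make_pairs([1, 2, 3, 4]) -> [(1, 2), (3, 4)]
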
def serialize_docs(docs):
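    """Convert Document objects into plain dicts holding their page_content and metadata."""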
new_docs = []
for doc in docs:
new_doc = {}
new_doc["page_content"] = doc.page_content
new_doc["metadata"] = doc.metadata
new_docs.append(new_doc)
return new_docs
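
# Illustrative usage, assuming LangChain-style documents exposing .page_content and .metadata:
#   serialize_docs(docs) -> [{"page_content": "...", "metadata": {...}}, ...]
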
def parse_output_llm_with_sources(output):
# Split the content into a list of text and "[Doc X]" references
content_parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
parts = []
for part in content_parts:
if part.startswith("Doc"):
subparts = part.split(",")
subparts = [subpart.lower().replace("doc","").strip() for subpart in subparts]
subparts = [f"""{subpart}""" for subpart in subparts]
parts.append("".join(subparts))
else:
parts.append(part)
content_parts = "".join(parts)
return content_parts
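
# Example: parse_output_llm_with_sources("Warming is accelerating [Doc 1, Doc 2].")
# keeps the surrounding text and replaces the bracketed block with inline markers for docs 1 and 2.
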
def process_figures(docs):
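    """Build the figure-sources HTML and a gallery of PIL images from the image chunks in docs."""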
    gallery = []
    used_figures = []
    # Accumulated HTML for the figure sources (filled in below)
    figures = ""
docs_figures = [d for d in docs if d.metadata["chunk_type"] == "image"]
for i, doc in enumerate(docs_figures):
if doc.metadata["chunk_type"] == "image":
if doc.metadata["figure_code"] != "N/A":
title = f"{doc.metadata['figure_code']} - {doc.metadata['short_name']}"
else:
title = f"{doc.metadata['short_name']}"
if title not in used_figures:
used_figures.append(title)
try:
key = f"Image {i+1}"
image_path = doc.metadata["image_path"].split("documents/")[1]
img = get_image_from_azure_blob_storage(image_path)
# Convert the image to a byte buffer
buffered = BytesIO()
max_image_length = 500
img_resized = img.resize((max_image_length, int(max_image_length * img.size[1]/img.size[0])))
img_resized.save(buffered, format="PNG")
img_str = base64.b64encode(buffered.getvalue()).decode()
figures = figures + make_html_figure_sources(doc, i, img_str)
gallery.append(img)
except Exception as e:
print(f"Skipped adding image {i} because of {e}")
return figures, gallery
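
# Illustrative call (assumes image chunks carrying "image_path", "figure_code" and "short_name" metadata):
#   figures_html, gallery = process_figures(docs)
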
def generate_html_graphs(graphs):
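    """Group graphs by category and return an HTML tab layout embedding each graph."""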
# Organize graphs by category
categories = defaultdict(list)
for graph in graphs:
category = graph['metadata']['category']
categories[category].append(graph['embedding'])
    # Begin constructing the HTML. This builds a simple tab layout (one button and
    # one panel per category); the exact markup and class names are front-end dependent.
    html_code = '''
    <div class="graphs-by-category">
        <h2>Graphs by Category</h2>
        <div class="tab-buttons">
    '''
    # Add buttons for each category (the first one starts active)
    for i, category in enumerate(categories.keys()):
        active_class = 'active' if i == 0 else ''
        html_code += f'<button class="tab-button {active_class}" data-category="{category}">{category}</button>'
    html_code += '''
        </div>
    '''
    # Add content for each category: one panel holding that category's embedded graphs
    for i, (category, embeds) in enumerate(categories.items()):
        active_class = 'active' if i == 0 else ''
        html_code += f'<div class="tab-content {active_class}" data-category="{category}">'
        for embed in embeds:
            html_code += embed
        html_code += '</div>'
    html_code += '''
    </div>
    '''
return html_code
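
# Illustrative input: each graph is a dict such as
#   {"metadata": {"category": "Temperature"}, "embedding": "<iframe ...></iframe>"}
# generate_html_graphs(graphs) then returns a single HTML string with one tab per category.
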
def make_html_source(source, i):
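    """Render one retrieved source as an HTML snippet with its name, content and reranking score."""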
meta = source.metadata
# content = source.page_content.split(":",1)[1].strip()
content = source.page_content.strip()
toc_levels = []
for j in range(2):
level = meta[f"toc_level{j}"]
if level != "N/A":
toc_levels.append(level)
else:
break
toc_levels = " > ".join(toc_levels)
if len(toc_levels) > 0:
name = f"{toc_levels} {meta['name']}"
else:
name = meta['name']
score = meta['reranking_score']
if score > 0.8:
color = "score-green"
elif score > 0.5:
color = "score-orange"
else:
color = "score-red"
relevancy_score = f"