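"""LLM helpers for drafting document paragraphs, optionally grounded in Wikipedia content.

Judging from the code below, every `task` dict is expected to carry the keys
'description', 'doc_description', 'above', 'before' and 'after', which describe the
paragraph to write and its position in the document hierarchy.
"""
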
import json
import string

import wikipedia
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

from src.tools.llms import openai_llm
# WikiPage is assumed to be defined alongside Wiki in src.tools.wiki.
from src.tools.wiki import Wiki, WikiPage


def get_wikilist(task: dict) -> list:
    """
    Get the titles of the Wikipedia pages that are useful for solving the given task.
    """

    llm = openai_llm
    template = (f"\n"
                f"Your task is to find the list of Wikipedia page titles that provide useful content "
                f"for a paragraph whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f"The paragraph belongs, at the top level of the hierarchy, to a document "
                f"whose description is delimited by triple backticks: ```{task['doc_description']}```\n"
                f"Make sure that the paragraph relates to the top level of the document.\n"
                f"\n"
                f"The paragraph belongs to a higher-level paragraph in the hierarchy "
                f"whose description is delimited by triple backticks: ```{task['above']}```\n"
                f"Make sure that the paragraph relates to its parent paragraph in the document hierarchy.\n"
                f"\n"
                f"The paragraph comes after previous paragraphs "
                f"whose description is delimited by triple backticks: ```{task['before']}```\n"
                f"Make sure that the paragraph follows the previous paragraphs without any repetition.\n"
                f"\n"
                f"The paragraph comes before the next paragraphs "
                f"whose description is delimited by triple backticks: ```{task['after']}```\n"
                f"\n"
                f"Format your response as a JSON list of strings separated by commas.\n")

    llm_list = llm(template)
    wikilist = extract_list(llm_list)

    # Expand each suggested title with related Wikipedia search results, then deduplicate.
    expanded_wikilist = []
    expand_factor = 2

    for wikipage in wikilist:
        expanded_wikilist += wikipedia.search(wikipage, results=expand_factor)

    wikilist = list(set(expanded_wikilist))

    return wikilist


def extract_list(llm_list: str):
    """Parse the LLM answer (expected to be a JSON list of strings) into a Python list."""
    print(llm_list)

    def filter_(el: str):
        # Keep only entries that are long enough and mostly made of letters
        # (this drops the separators produced by the fallback split below).
        resp = 2 < len(el)
        usable_length = len([c for c in el if c in string.ascii_letters])
        resp = resp and len(el) * 3 / 4 < usable_length
        return resp

    try:
        # Prefer proper JSON parsing; fall back to a crude split on double quotes
        # when the answer is not valid JSON.
        try:
            wikilist = json.loads(llm_list)
        except json.JSONDecodeError:
            wikilist = llm_list[1:-1].split('"')
        wikilist = [el for el in wikilist if filter_(el)]
        print(wikilist)
    except Exception:
        wikilist = []
        print('issues with the wikilist')
    return wikilist


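# Illustrative example (hypothetical titles), assuming the LLM answered in the
# requested JSON format:
#   extract_list('["Solar energy", "Wind power"]')  ->  ['Solar energy', 'Wind power']

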
def get_public_paragraph(task: dict) -> str:
    """Return the paragraph generated directly by the LLM, without any retrieved context."""

    llm = openai_llm
    template = (f"\n"
                f"Your task is to generate a paragraph "
                f"whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f"The paragraph belongs, at the top level of the hierarchy, to a document "
                f"whose description is delimited by triple backticks: ```{task['doc_description']}```\n"
                f"Make sure that the paragraph relates to the top level of the document.\n"
                f"\n"
                f"The paragraph belongs to a higher-level paragraph in the hierarchy "
                f"whose description is delimited by triple backticks: ```{task['above']}```\n"
                f"Make sure that the paragraph relates to its parent paragraph in the document hierarchy.\n"
                f"\n"
                f"The paragraph comes after previous paragraphs "
                f"whose description is delimited by triple backticks: ```{task['before']}```\n"
                f"Make sure that the paragraph follows the previous paragraphs without any repetition.\n"
                f"\n"
                f"The paragraph comes before the next paragraphs "
                f"whose description is delimited by triple backticks: ```{task['after']}```\n"
                f"Make sure that the paragraph prepares the transition to the next paragraph without any repetition.\n")

    p = llm(template)

    return p


def create_index(wikilist: list):
    """
    Build the vector index of the Wikipedia pages listed in `wikilist`.
    """
    fetch = Wiki().fetch

    # Fetch each page once and keep only successful fetches (fetch returns a string on error).
    fetched = [(title, fetch(title)) for title in wikilist]
    pages = [(title, page) for title, page in fetched if not isinstance(page, str)]

    chunk = 800
    texts = []
    for title, page in pages:
        texts.append(WikiPage(title=title, fulltext=page.page_content))

    doc_splitter = CharacterTextSplitter(
        separator=".",
        chunk_size=chunk,
        chunk_overlap=100,
        length_function=len,
    )

    split_texts = []
    for text in texts:
        for p in text.get_paragraphs(chunk=chunk):
            split_texts += doc_splitter.split_text(p)

    # Sanity checks on the chunks before indexing.
    for split_text in split_texts:
        assert isinstance(split_text, str)
        assert 0 < len(split_text) < 2 * 500

    # Chroma needs an embedding function; OpenAI embeddings are assumed here.
    wiki_index = Chroma.from_texts(split_texts, OpenAIEmbeddings())

    return wiki_index


def get_wiki_paragraph(wiki_index, task: dict) -> str:
    """Generate the paragraph described by `task`, grounded in content retrieved from the wiki index."""

    task_description = get_public_paragraph(task)
    # Retrieve the most relevant chunks from the index (Chroma similarity search).
    wiki_paragraphs = wiki_index.similarity_search(task_description)
    text_content = ""
    for p in wiki_paragraphs:
        text_content += p.page_content + "\n\n"

    template = (f"\n"
                f"Your task is to generate a paragraph "
                f"whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f"The text generation is based on the documents provided in these sections, "
                f"delimited by triple backticks: ```{text_content}```\n"
                f"The paragraph belongs, at the top level of the hierarchy, to a document "
                f"whose description is delimited by triple backticks: ```{task['doc_description']}```\n"
                f"Make sure that the paragraph relates to the top level of the document.\n"
                f"\n"
                f"The paragraph belongs to a higher-level paragraph in the hierarchy "
                f"whose description is delimited by triple backticks: ```{task['above']}```\n"
                f"Make sure that the paragraph relates to its parent paragraph in the document hierarchy.\n"
                f"\n"
                f"The paragraph comes after previous paragraphs "
                f"whose description is delimited by triple backticks: ```{task['before']}```\n"
                f"Make sure that the paragraph follows the previous paragraphs without any repetition.\n"
                f"\n"
                f"The paragraph comes before the next paragraphs "
                f"whose description is delimited by triple backticks: ```{task['after']}```\n"
                f"Make sure that the paragraph prepares the transition to the next paragraph without any repetition.\n")

    llm = openai_llm
    p = llm(template)

    return p


def get_private_paragraph(texts, task: dict) -> str:
    """Generate the paragraph described by `task`, grounded in the private texts provided by the caller."""

    text_content = ""
    for t in texts:
        text_content += t + "\n\n"

    template = (f"\n"
                f"Your task is to generate a paragraph "
                f"whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f"The text generation is based on the documents provided in these sections, "
                f"delimited by triple backticks: ```{text_content}```\n"
                f"The paragraph belongs, at the top level of the hierarchy, to a document "
                f"whose description is delimited by triple backticks: ```{task['doc_description']}```\n"
                f"Make sure that the paragraph relates to the top level of the document.\n"
                f"\n"
                f"The paragraph belongs to a higher-level paragraph in the hierarchy "
                f"whose description is delimited by triple backticks: ```{task['above']}```\n"
                f"Make sure that the paragraph relates to its parent paragraph in the document hierarchy.\n"
                f"\n"
                f"The paragraph comes after previous paragraphs "
                f"whose description is delimited by triple backticks: ```{task['before']}```\n"
                f"Make sure that the paragraph follows the previous paragraphs without any repetition.\n"
                f"\n"
                f"The paragraph comes before the next paragraphs "
                f"whose description is delimited by triple backticks: ```{task['after']}```\n"
                f"Make sure that the paragraph prepares the transition to the next paragraph without any repetition.\n")

    llm = openai_llm
    p = llm(template)

    return p
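

# Minimal end-to-end sketch (illustrative only): the task fields below are
# placeholders, and running it requires network access plus OpenAI credentials
# configured for openai_llm and OpenAIEmbeddings.
if __name__ == "__main__":
    example_task = {
        "description": "paragraph topic",
        "doc_description": "document topic",
        "above": "parent section summary",
        "before": "previous paragraph summary",
        "after": "next paragraph summary",
    }
    titles = get_wikilist(example_task)
    index = create_index(titles)
    print(get_wiki_paragraph(index, example_task))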