import os

import requests
import pandas as pd
import gradio as gr
from bs4 import BeautifulSoup
from openai import OpenAI
from groq import Groq
from serpapi import GoogleSearch


# API keys are read from environment variables.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
client_groq = Groq(api_key=GROQ_API_KEY)  # initialised here but not used below

openai_key = os.getenv("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = openai_key
client = OpenAI()

SERPAPI_KEY = os.getenv("SERPAPI_KEY")

api_key = os.getenv("API_KEY")


def scrape_website(url):
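    """Fetch `url` and return the text of all <p> tags, newline-joined.

    Returns a human-readable error string (rather than raising) on HTTP
    failures, timeouts, SSL errors and other request exceptions.
    """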
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.encoding = response.apparent_encoding

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            paragraphs = soup.find_all('p')
            scraped_data = [p.get_text() for p in paragraphs]
            return "\n".join(scraped_data)
        else:
            return f"Failed to retrieve the webpage (Status Code: {response.status_code})"

    except requests.exceptions.ReadTimeout:
        return "Request timed out after 20 seconds."
    except requests.exceptions.SSLError as e:
        return f"Request Error: {e}"
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"


def update_dataframe_with_results(organic_results):
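    """Build a DataFrame of SerpAPI organic results plus scraped page text.

    Each result's link is fetched with scrape_website(); the scraped text is
    truncated to max_chars before being stored in 'Scraped Content'.
    """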
    max_chars = 100000  # cap per-page text so rows stay a manageable size
    data = []
    for result in organic_results:
        scraped_content = scrape_website(result.get('link'))
        if len(scraped_content) > max_chars:
            scraped_content = scraped_content[:max_chars]
        data.append({
            "Title": result.get('title'),
            "Link": result.get('link'),
            "Snippet": result.get('snippet'),
            "Displayed Link": result.get('displayed_link'),
            "Date": result.get('date'),
            "Rich Snippet": result.get('rich_snippet'),
            "Scraped Content": scraped_content
        })
    df = pd.DataFrame(data)
    return df


def opencall(text, user_query):
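    """Send one batch of scraped content to GPT-4 and return its report.

    The model is asked to structure its answer into the three 'Perspective'
    sections that rearrange_text() later splits on; the returned string ends
    with the 'XXXXX' batch delimiter.
    """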
    print("Calling opencall function with", len(text), "characters")

    # The three 'Perspective' headings requested below must match the markers
    # defined in rearrange_text(), which resumen() relies on downstream.
    completion = client.chat.completions.create(
        model="gpt-4-0125-preview",
        temperature=0.1,
        messages=[
            {"role": "system", "content": "You are a helpful assistant, specialised in preparing content for a presentation."},
            {"role": "system", "content": "Your task is to prepare a base report on the topics, themes and trends addressed in the latest conferences, seminars and symposiums."},
            {"role": "system", "content": "For this matter I will be providing you in the Information Pool a compilation of several scraped Google search results from the latest conferences, seminars and symposiums on the topic: " + user_query},
            {"role": "system", "content": "Each piece of Scraped Content starts with the tag '### Title:' indicating the title, followed by the URL reference '### Link:', followed by the contents '### Content:'"},
            {"role": "system", "content": "Process all the information in the Information Pool to provide:"},
            {"role": "system", "content": "1) Perspective of Relevant Information: Assess and extract the most relevant information from the point of view of this aspect: " + user_query + "."},
            {"role": "system", "content": "2) Perspective of Key Emerging Aspects: Highlight the key emerging aspects, topics and themes. Cite the URLs that source them."},
            {"role": "system", "content": "3) Perspective of Key Entities: Highlight the key entities (organisations, programmes, technologies, people and officials). Cite the URLs that source them."},
            {"role": "system", "content": "In the response, use the indicated structure of 1) Perspective of Relevant Information 2) Perspective of Key Emerging Aspects 3) Perspective of Key Entities"},
            {"role": "user", "content": "Information Pool:" + text}
        ]
    )
    response = completion.choices[0].message.content
    # "XXXXX" is the batch delimiter that rearrange_text() splits on later.
    response = response + "\n" + "XXXXX" + "\n"
    return response


def split_large_content(content, max_length=30000):
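    """Split an oversized document into segments of at most max_length chars.

    The leading title/link header is repeated at the start of every segment,
    and cuts are made at word boundaries where possible.
    """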
    # Preserve the "### Title: ... ### Link: ..." header so that every
    # segment remains self-describing when sent to the model on its own.
    title_and_source_end = content.find('\n\n') + 2
    title_and_source = content[:title_and_source_end]
    max_segment_length = max_length - len(title_and_source)

    segments = []
    content_body = content[title_and_source_end:]

    while len(content_body) > 0:
        segment = content_body[:max_segment_length]

        # Cut at the last space so words are not split mid-token.
        if len(content_body) > max_segment_length:
            last_space = segment.rfind(' ')
            if last_space > 0:
                segment = segment[:last_space]

        segments.append(title_and_source + segment)
        content_body = content_body[len(segment):]

    return segments


def main(df, google_search_query):
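    """Batch the DataFrame's documents into ~30,000-character pools, send
    each pool (or oversized-document segment) to opencall(), append the
    responses to respuestas.txt, and return them concatenated.
    """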
    information_pool = ""
    archivo1 = ""

    with open('respuestas.txt', mode='a', encoding='utf-8') as file:
        for index, row in df.iterrows():
            document_name = row['Title']

            processed_content = ("### Title: " + str(row['Title']) + "\n"
                                 + "### Link: " + str(row['Link']) + "\n"
                                 + "### Content: " + str(row['Scraped Content']) + "\n\n")

            print(document_name, ":", len(processed_content))
            print("accumulated:", len(information_pool + processed_content))

            if len(processed_content) > 30000:
                # Case C: a single document exceeds the batch limit, so it is
                # split into segments and each segment is sent on its own.
                content_segments = split_large_content(processed_content)
                for segment in content_segments:
                    print("In C, new Text length:", len(segment))
                    response = opencall(segment, google_search_query)
                    archivo1 = archivo1 + response + '\n'
                    file.write(response + '\n')
            else:
                if len(information_pool + processed_content) <= 30000:
                    # Case A: the document still fits into the current batch.
                    information_pool += processed_content
                    print("In A, new Text length:", len(information_pool))
                else:
                    # Case B: flush the current batch, then start a new one
                    # with the current document.
                    print("In B1, calling with Text length:", len(information_pool))
                    response = opencall(information_pool, google_search_query)
                    file.write(response + '\n')
                    archivo1 = archivo1 + response + '\n'
                    information_pool = processed_content
                    print("In B2, new Text length:", len(information_pool), "with document:", document_name)

        if information_pool:
            print("Final call")
            response = opencall(information_pool, google_search_query)
            file.write(response + '\n')
            archivo1 = archivo1 + response + '\n'

    return archivo1


def rearrange_text(text):
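    """Regroup the batched responses by section.

    Splits `text` on the 'XXXXX' delimiter, extracts the three 'Perspective'
    sections from each batch, and returns the concatenated text of each
    section across all batches.
    """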
    # opencall() appends "XXXXX" after every response, so splitting on it
    # recovers the individual batch responses.
    batches = text.split('XXXXX')

    all_texta = ""
    all_textb = ""
    all_textc = ""

    # NB: these markers must match the section headings that opencall()
    # instructs the model to produce.
    markers = {
        'texta_marker': "Perspective of Relevant Information",
        'textb_marker': "Perspective of Key Emerging Aspects",
        'textc_marker': "Perspective of Key Entities"
    }

    for batch in batches:
        texta_start = batch.find(markers['texta_marker'])
        textb_start = batch.find(markers['textb_marker'])
        textc_start = batch.find(markers['textc_marker'])

        texta = batch[texta_start:textb_start] if textb_start != -1 else batch[texta_start:]
        textb = batch[textb_start:textc_start] if textc_start != -1 else batch[textb_start:]
        textc = batch[textc_start:]

        texta = texta.replace(markers['texta_marker'], '').strip()
        textb = textb.replace(markers['textb_marker'], '').strip()
        textc = textc.replace(markers['textc_marker'], '').strip()

        all_texta += "\n" + texta if all_texta else texta
        all_textb += "\n" + textb if all_textb else textb
        all_textc += "\n" + textc if all_textc else textc

    return all_texta, all_textb, all_textc


def resumen(text):
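    """Produce the final compiled report.

    Rearranges the batched responses into the three perspectives, asks GPT-4
    to write one consolidated section for each, and returns the three
    sections concatenated.
    """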
    texta, textb, textc = rearrange_text(text)

    completion = client.chat.completions.create(
        model="gpt-4-0125-preview",
        temperature=0.5,
        messages=[
            {"role": "system", "content": "You are a helpful assistant, specialised in composing and integrating information."},
            {"role": "system", "content": "Your task is to provide an integrated, comprehensive 2000-word narrative of the different points indicated in the Information Pool text for an internal report on recent news."},
            {"role": "system", "content": "Instructions. Elaborate the text following these rules:"},
            {"role": "system", "content": "Be exhaustive, comprehensive and detailed in addressing the relation of the different points indicated in the Information Pool text."},
            {"role": "system", "content": "Arrange paragraphs and information around each entity or related entities and concepts, integrating them with a fluent narrative."},
            {"role": "system", "content": "Start directly with the narrative, do not introduce the text, as it is part of a broader report."},
            {"role": "system", "content": "Use a formal writing style, yet plain and easy to read. Avoid pomposity and making up artificial descriptions. The audience is well acquainted with technical and defence/military vocabulary, information and entities."},
            {"role": "user", "content": "Information Pool:" + texta}
        ]
    )
    response1 = completion.choices[0].message.content if completion.choices[0].message else ""
    response_1 = "1) Perspective of Relevant Information:" + "\n" + response1 + "\n"

    completion = client.chat.completions.create(
        model="gpt-4-0125-preview",
        temperature=0.5,
        messages=[
            {"role": "system", "content": "You are a helpful assistant, specialised in composing and integrating information."},
            {"role": "system", "content": "Your task is to provide a comprehensive and integrated relation, about 2000 words in length, of the different emerging aspects indicated in the Information Pool text for an internal report on recent news."},
            {"role": "system", "content": "Instructions. Elaborate the text following these rules:"},
            {"role": "system", "content": "Be exhaustive, comprehensive and detailed in the relation."},
            {"role": "system", "content": "Arrange paragraphs and information around each entity or related entities and concepts."},
            {"role": "system", "content": "Start directly with the relation, do not introduce the text, as it is part of a broader report."},
            {"role": "system", "content": "Use a formal writing style, yet plain and easy to read. The audience is well acquainted with technical and defence/military vocabulary, information and entities."},
            {"role": "user", "content": "Information Pool:" + textb}
        ]
    )
    response2 = completion.choices[0].message.content if completion.choices[0].message else ""
    response_2 = "2) Perspective of Key Emerging Aspects:" + "\n" + response2 + "\n"

    completion = client.chat.completions.create(
        model="gpt-4-0125-preview",
        temperature=0.5,
        messages=[
            {"role": "system", "content": "You are a helpful assistant, specialised in composing and integrating information."},
            {"role": "system", "content": "Your task is to consolidate and sort the relation of the different entities indicated in the Information Pool text for an internal report on recent news."},
            {"role": "system", "content": "Instructions. Elaborate the text following these rules:"},
            {"role": "system", "content": "Be exhaustive in the sorting. Sort around similar entry types: Organization, Program, Technology, Entity, ... You can merge similar entry types (i.e. Technologies and Technology Terms and Concepts, People and Officials, ...)"},
            {"role": "system", "content": "Arrange and integrate entries around similar or related concepts. Discard duplicated concepts or elements."},
            {"role": "system", "content": "Start directly with the relation, do not introduce the text, as it is part of a broader report."},
            {"role": "system", "content": "The audience is well acquainted with technical and defence/military vocabulary, information and entities."},
            {"role": "user", "content": "Information Pool:" + textc}
        ]
    )
    response3 = completion.choices[0].message.content if completion.choices[0].message else ""
    response_3 = "3) Perspective of Key Entities:" + "\n" + response3 + "\n"

    compilacion = response_1 + "\n" + response_2 + "\n" + response_3
    print(compilacion)
    print("\n\n")
    return compilacion


def get_organic_results(query, periodo_tbs, num_results):
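    """Run a Google search through SerpAPI and return its organic results.

    `periodo_tbs` is passed as the tbs time-range parameter; result titles
    are printed as a progress trace.
    """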
    params = {
        "q": query,
        "num": str(num_results),
        "tbs": periodo_tbs,
        "api_key": SERPAPI_KEY
    }
    search = GoogleSearch(params)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])

    for result in organic_results:
        print("Title:", result.get('title'))
        print()

    return organic_results


def process_inputs(task_type, topic, integration_period, num_results):
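    """Gradio callback: build the search query, run the scrape-and-summarise
    pipeline, and return the intermediate and compiled reports.
    """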
    # task_type is currently unused; the query hardcodes the event keywords.
    google_search_query = f'"{topic}" conferences OR seminars OR symposiums'

    # Map the UI period labels to Google's tbs time-range syntax (assumed
    # mapping; SerpAPI forwards tbs verbatim to Google search).
    tbs_map = {"1M": "qdr:m", "3M": "qdr:m3", "6M": "qdr:m6", "1Y": "qdr:y"}
    periodo_tbs = tbs_map.get(integration_period, integration_period)
    num_resultados = int(num_results)

    results = get_organic_results(google_search_query, periodo_tbs, num_resultados)
    df = update_dataframe_with_results(results)
    archivo1 = main(df, google_search_query)
    resumen_text = resumen(archivo1)

    return archivo1, resumen_text
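

# Example (hypothetical topic and values) of running the full pipeline
# without the UI:
#   intermediate, final = process_inputs("Conferences", "quantum sensing", "3M", 5)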

with gr.Blocks() as app:
    with gr.Row():
        with gr.Column():
            task_type = gr.Dropdown(choices=["Conferences", "Seminars", "Symposiums"], label="Select the task type:")
            topic = gr.Textbox(label="Aspect or topic to work on", placeholder="Enter the topic here...")
            integration_period = gr.Dropdown(choices=["1M", "3M", "6M", "1Y"], label="Information integration period")
            num_results = gr.Number(label="Number of results to work on", value=10)
            submit_button = gr.Button("Submit")
        with gr.Column():
            output_text_intermedio = gr.Textbox(label="Intermediate Results", interactive=True, lines=10)
            output_text_final = gr.Textbox(label="Compiled Results", interactive=True, lines=10)

    submit_button.click(
        fn=process_inputs,
        inputs=[task_type, topic, integration_period, num_results],
        outputs=[output_text_intermedio, output_text_final]
    )

if __name__ == "__main__":
    app.launch()