Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os.path
|
2 |
+
import openai
|
3 |
+
from openai import OpenAI
|
4 |
+
import os
|
5 |
+
from groq import Groq
|
6 |
+
import requests
|
7 |
+
import time
|
8 |
+
from html.parser import HTMLParser
|
9 |
+
from bs4 import BeautifulSoup
|
10 |
+
import json
|
11 |
+
from datetime import datetime
|
12 |
+
import pandas as pd
|
13 |
+
from serpapi import GoogleSearch
|
14 |
+
import gradio as gr
|
# --- API client configuration (keys are read from the environment) ---
# BUG FIX: the original called bare `getenv(...)`, which is undefined
# (only `os` is imported), so the module raised NameError on import.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
client_groq = Groq(api_key=GROQ_API_KEY)

openai_key = os.getenv("OPENAI_API_KEY")
# OpenAI() picks the key up from the environment, so mirror it there.
os.environ["OPENAI_API_KEY"] = openai_key
client = OpenAI()

SERPAPI_KEY = os.getenv("SERPAPI_KEY")

api_key = os.getenv("API_KEY")
def scrape_website(url):
    """Fetch *url* and return the concatenated text of its <p> tags.

    On any failure this returns a human-readable error string rather than
    raising, so callers can store the message in place of the content.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=20)
        # Set the decoded encoding to the detected one. FIX: the original
        # then parsed `response.content` (raw bytes), which ignores this
        # setting entirely; `response.text` below actually honours it.
        response.encoding = response.apparent_encoding

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            paragraphs = soup.find_all('p')
            # One paragraph per line, content only (no title/URL here).
            return "\n".join(p.get_text() for p in paragraphs)
        else:
            return "Failed to retrieve the webpage (Status Code: {})".format(response.status_code)

    except requests.exceptions.ReadTimeout:
        # Request exceeded the 20-second timeout above.
        return "Request timed out after 20 seconds."
    except requests.exceptions.SSLError as e:
        return "Request Error: {}".format(e)
    except requests.exceptions.RequestException as e:
        # Any other requests-related failure (DNS, connection, ...).
        return "An error occurred: {}".format(e)
def update_dataframe_with_results(organic_results):
    """Build a DataFrame from SerpAPI organic results, one row per hit,
    with the scraped page text attached in a "Scraped Content" column.
    """
    max_chars = 100000  # cap per-cell content size
    rows = []
    for item in organic_results:
        # Fetch the page body for this hit (may be an error-message string).
        content = scrape_website(item.get('link'))
        # Truncate oversized pages to keep cells manageable.
        if len(content) > max_chars:
            content = content[:max_chars]
        rows.append({
            "Title": item.get('title'),
            "Link": item.get('link'),
            "Snippet": item.get('snippet'),
            "Displayed Link": item.get('displayed_link'),
            "Date": item.get('date'),  # not always present in results
            "Rich Snippet": item.get('rich_snippet'),  # not always present
            "Scraped Content": content,
        })
    return pd.DataFrame(rows)
def opencall(text, user_query):
    """Ask the chat model to turn one Information Pool chunk into a
    three-perspective report on *user_query*.

    The reply is suffixed with the 'XXXXX' batch separator so downstream
    code can split concatenated reports back apart.
    """
    print("Calling opencall function with", len(text), "characters")

    system_prompts = [
        "You are a helpful assistant, specialised in preparing contents for preparing a presentation.",
        "Your task is to prepare a base report on the topics, themes and trends addressed in the latest conferences, seminars and symposiums.",
        "For this matter I will be providing you in the Information Pool a compilation of several scraped google search results from the latest conferences, seminars and symposiums on the topic: " + user_query,
        "Each piece of Scraped Content start with the tag '### Title:' indicating the title, followed by the URL reference '### Link:' , followed by the contents '### Content:'",
        "Process all the information in the Information Pool to provide:",
        "1) Perspective of Relevant Information: Assess and extract the most relevant information from the point of view of this aspect: " + user_query + ".",
        "2) Perspective of Key Topics: Highlight the key topics and themes.Cite the URLs that source those topics and themes",
        "3) Perspective of Emergent Trends: Highlight the emergent trends.Cite the URLs that source those trends.",
        "In the response, use the indicated structure of 1)Perspective of Relevant Information 2)Perspective of Key Topics 3)Perspective of Emergent Trends",
    ]
    messages = [{"role": "system", "content": prompt} for prompt in system_prompts]
    messages.append({"role": "user", "content": "Information Pool:" + text})

    completion = client.chat.completions.create(
        model="gpt-4-0125-preview",
        temperature=0.1,
        messages=messages,
    )
    reply = completion.choices[0].message.content
    # Terminate the batch with the separator rearrange_text() splits on.
    return reply + "\n" + "XXXXX" + "\n"
def split_large_content(content, max_length=30000):
    """Split *content* into segments of at most *max_length* characters.

    The leading title/source header — everything up to and including the
    first blank line ('\\n\\n') — is repeated at the start of every segment
    so each chunk remains self-describing for the LLM.

    Splitting prefers word boundaries (last space in the slice).
    Returns a list of header-prefixed segment strings.
    """
    # Header ends just after the first blank line; if '\n\n' is absent,
    # find() returns -1 and the header degenerates to the first character.
    header_end = content.find('\n\n') + 2
    header = content[:header_end]

    # Reserve room for the header inside every segment.
    max_segment_length = max_length - len(header)

    segments = []
    body = content[header_end:]

    while len(body) > 0:
        segment = body[:max_segment_length]

        # Back-track to the last complete word, but only when that leaves
        # a non-empty segment. BUG FIX: the original used segment[:last_space]
        # unconditionally; when the only space sat at index 0 the segment
        # became empty, `body` never shrank, and the loop ran forever.
        if len(body) > max_segment_length:
            last_space = segment.rfind(' ')
            if last_space > 0:
                segment = segment[:last_space]

        segments.append(header + segment)

        # Advance past exactly what was consumed (no characters are lost).
        body = body[len(segment):]

    return segments
def main(df,google_search_query):
    """Batch the scraped rows of *df* into ~30k-character Information
    Pools, send each pool to the LLM via opencall(), append every
    response to 'respuestas.txt', and return all responses concatenated.

    Batching strategy:
      - rows longer than 30k chars are split with split_large_content()
        and each segment is sent on its own;
      - shorter rows are accumulated into `information_pool` until adding
        one would exceed 30k, at which point the pool is flushed to the
        LLM and restarted with the current row;
      - any remainder is flushed after the loop.
    """
    # Accumulator for the current (not yet sent) Information Pool.
    information_pool = ""
    # Concatenation of every LLM response; this is the return value.
    archivo1=""
    # Open or create a plain text file in append mode
    with open('respuestas.txt', mode='a', encoding='utf-8') as file:

        # Iterate over the rows of the DataFrame
        for index, row in df.iterrows():
            document_name = row['Title'] # Using title as document_name
            # NOTE(review): raw_content and link are assigned but unused
            # below — processed_content rebuilds from `row` directly.
            raw_content = str(row['Scraped Content']) # Convert to string to ensure compatibility
            link = row['Link'] # Retrieve link for additional usage or logging

            # Tagged block format expected by opencall()'s prompt
            # ('### Title:' / '### Link:' / '### Content:').
            processed_content = "### Title: " + row['Title'] + "\n" + "### Link: " + row['Link'] + "\n" + "### Content: " + str(row['Scraped Content']) + "\n" + "\n"

            print(document_name, ":", len(processed_content))
            #print("Contenido:", processed_content)
            print("acumulado:", len(information_pool + processed_content))

            # Case C: single row too large for one call — split and send
            # each segment independently.
            if len(processed_content) > 30000:
                content_segments = split_large_content(processed_content)
                for segment in content_segments:
                    print("EN C, Nuevo valor de Text:", len(segment))
                    #print("segmen:",segment)
                    response = opencall(segment,google_search_query)
                    archivo1=archivo1+response+'\n'
                    file.write(response + '\n')

            else:
                # Case A: row still fits in the current pool — accumulate.
                if len(information_pool + processed_content) <= 30000:
                    information_pool += processed_content
                    print("EN A, Nuevo valor de Text:", len(information_pool))
                else:
                    # Case B: pool would overflow — flush it to the LLM,
                    # then start a fresh pool with the current row.
                    print("EN B1, llamando con valor de Text:", len(information_pool))
                    #print("Information pool", information_pool)
                    response = opencall(information_pool,google_search_query)
                    file.write(response + '\n')
                    archivo1=archivo1+response+'\n'
                    information_pool = processed_content
                    print("EN B2, nuevo valor de Text:", len(information_pool), " Con documento:", document_name)

        # Flush whatever remains accumulated after the last row.
        if information_pool:
            print("Final call")
            response = opencall(information_pool,google_search_query)
            file.write(response + '\n')
            archivo1=archivo1+response+'\n'
    return archivo1
def rearrange_text(text):
    """Split the concatenated opencall() responses (batches separated by
    'XXXXX') into their three report sections and regroup them by section.

    Returns a tuple (all_texta, all_textb, all_textc): the concatenated
    'Relevant Information', 'Key Topics' and 'Emergent Trends' sections
    across all batches, with the section headings stripped.
    """
    # Each opencall() reply ends with 'XXXXX', so split on it.
    batches = text.split('XXXXX')

    # Accumulators for the three regrouped sections.
    all_texta = ""
    all_textb = ""
    all_textc = ""

    # Section headings as mandated by opencall()'s response structure.
    # BUG FIX: the original looked for "Perspective of Key Emerging
    # Aspects" / "Perspective of Key Entities", which opencall() never
    # emits — find() returned -1 and sections b/c collapsed to the last
    # character of each batch.
    markers = {
        'texta_marker': "Perspective of Relevant Information",
        'textb_marker': "Perspective of Key Topics",
        'textc_marker': "Perspective of Emergent Trends"
    }

    for batch in batches:
        texta_start = batch.find(markers['texta_marker'])
        textb_start = batch.find(markers['textb_marker'])
        textc_start = batch.find(markers['textc_marker'])

        # A missing marker yields an empty section instead of a stray
        # negative-index slice.
        if texta_start != -1:
            texta = batch[texta_start:textb_start] if textb_start != -1 else batch[texta_start:]
        else:
            texta = ""
        if textb_start != -1:
            textb = batch[textb_start:textc_start] if textc_start != -1 else batch[textb_start:]
        else:
            textb = ""
        textc = batch[textc_start:] if textc_start != -1 else ""

        # Strip the heading itself from each extracted section.
        texta = texta.replace(markers['texta_marker'], '').strip()
        textb = textb.replace(markers['textb_marker'], '').strip()
        textc = textc.replace(markers['textc_marker'], '').strip()

        # Join sections from successive batches with newlines.
        all_texta += "\n" + texta if all_texta else texta
        all_textb += "\n" + textb if all_textb else textb
        all_textc += "\n" + textc if all_textc else textc

    return all_texta, all_textb, all_textc
def resumen(text):
    """Compile the final report from the accumulated batch output *text*.

    Splits *text* into its three perspectives with rearrange_text(), asks
    the chat model to write one integrated ~2000-word narrative for each,
    and returns the three labelled narratives concatenated.

    NOTE(review): rearrange_text()'s section markers and the labels used
    here ("Key emerging aspects" / "Key Entities") do not match the
    section names opencall() is instructed to emit — confirm which naming
    is intended before relying on sections b/c.
    """
    texta, textb, textc = rearrange_text(text)

    # 1) Integrated narrative for the "Relevant Information" section.
    completion = client.chat.completions.create(model="gpt-4-0125-preview",temperature=0.5, messages=[
        {"role": "system", "content": "You are a helpful assistant, specialised in composing and integrating information."},
        {"role": "system", "content": "Your task is to provide an integrated comprehensive 2000 words narrative of the different points indicated in the Information Pool text for a internal report on recent news." },
        {"role": "system", "content": "Instructions. Elaborate the text following these rules:" },
        {"role": "system", "content": "Be exhaustive, comprehensive and detailed in addressing the relation of different points indicated in the Information Pool text." },
        {"role": "system", "content": "Arrange paragraphs and information around each entity or related entities and concepts, integrating them with a fluent narrative." },
        {"role": "system", "content": "Start directly with the narrative, do not introduce the text, as it is part of a broader report." },
        {"role": "system", "content": "Use a formal writing style, yet plain and easy to read. Avoid pomposity and making up artificial descriptions. The audience is well acquainted with technical and defence/military vocabulary, information and entities. " },

        {"role": "user", "content":"Information Pool:"+texta} ] )

    # Guard against a missing message object in the API response.
    response1 = completion.choices[0].message.content if completion.choices[0].message else ""
    response_1="1) Perspective of Relevant Information:"+"\n"+response1+"\n"

    # 2) Integrated relation for the "emerging aspects" section.
    completion = client.chat.completions.create(model="gpt-4-0125-preview",temperature=0.5, messages=[
        {"role": "system", "content": "You are a helpful assistant, specialised in composing and integrating information."},
        {"role": "system", "content": "Your task is to provide a comprehensive and integrated relation of about 2000 words in length of the different emerging aspects indicated in the Information Pool text for a internal report on recent news." },
        {"role": "system", "content": "Instructions. Elaborate the text following these rules:" },
        {"role": "system", "content": "Be exhaustive, comprehensive and detailed in the relation." },
        {"role": "system", "content": "Arrange paragraphs and information around each entity or related entities and concepts." },
        {"role": "system", "content": "Start directly with the relation, do not introduce the text, as it is part of a broader report." },
        {"role": "system", "content": "Use a formal writing style, yet plain and easy to read. The audience is well acquainted with technical and defence/military vocabulary, information and entities. " },

        {"role": "user", "content":"Information Pool:"+textb} ] )
    response2 = completion.choices[0].message.content if completion.choices[0].message else ""
    response_2=" 2)Perspective of Key emerging aspects:"+"\n"+response2+"\n"

    # 3) Sorted/consolidated relation of entities.
    # NOTE(review): "sore" in the prompt below looks like a typo for
    # "sort" — it is a runtime string, so left untouched; confirm intent.
    completion = client.chat.completions.create(model="gpt-4-0125-preview",temperature=0.5, messages=[
        {"role": "system", "content": "You are a helpful assistant, specialised in composing and integrating information."},
        {"role": "system", "content": "Your task is to consolidate and sore the relation of the different entities indicated in the Information Pool text for a internal report on recent news." },
        {"role": "system", "content": "Instructions. Elaborate the text following these rules:" },
        {"role": "system", "content": "Be exhaustive in the sorting. Sort around similar entry types: Organization, Program, Technology, Entity, ... You can merge similar entry types (i.e. Technologies and Technology Terms and Concepts, People and Officials,...)" },
        {"role": "system", "content": "Arrange and integrate entries around similar or related concepts. Discard duplicated concepts or elements." },
        {"role": "system", "content": "Start directly with the relation, do not introduce the text, as it is part of a broader report." },
        {"role": "system", "content": "The audience is well acquainted with technical and defence/military vocabulary, information and entities. " },

        {"role": "user", "content":"Information Pool:"+textc} ] )

    response3 = completion.choices[0].message.content if completion.choices[0].message else ""
    response_3=" 3)Perspective of of Key Entities"+"\n"+response3+"\n"
    # Final compilation returned to the UI (also echoed to stdout).
    compilacion=response_1+"\n"+response_2+"\n"+response_3
    print(compilacion)
    print("\n\n")
    print("\n\n")
    return compilacion
# Fetch Google organic search results through SerpAPI.
def get_organic_results(query, periodo_tbs, num_results):
    """Run *query* through SerpAPI's Google engine and return the list of
    organic results (empty list when none are present).

    periodo_tbs is forwarded verbatim as the 'tbs' time-window filter.
    """
    params = {
        "q": query,
        "num": str(num_results),
        "tbs": periodo_tbs,  # time-window filter for the results
        "api_key": SERPAPI_KEY,
    }
    results = GoogleSearch(params).get_dict()
    organic_results = results.get("organic_results", [])

    # Echo the titles so progress is visible in the console.
    for entry in organic_results:
        print("Title:", entry.get('title'))
        print()

    return organic_results
def process_inputs(task_type, topic, integration_period, num_results):
    """Gradio callback: run the full search → scrape → report pipeline.

    Returns a pair (intermediate per-batch reports, final compiled
    summary) for the two output textboxes. task_type is currently unused.
    """
    # Quote the topic literally and broaden to event-style pages.
    google_search_query = f'"{topic}" Conferences OR seminars OR SYMPOSIUMS'

    hits = get_organic_results(google_search_query, integration_period, int(num_results))
    frame = update_dataframe_with_results(hits)
    intermediate = main(frame, google_search_query)
    final_summary = resumen(intermediate)

    return intermediate, final_summary
# Create the Gradio blocks interface (two-column layout: inputs | outputs).
with gr.Blocks() as app:
    with gr.Row():
        with gr.Column():
            # Input controls (labels in Spanish for the app's audience).
            task_type = gr.Dropdown(choices=["Conferencias", "Seminarios", "Simposios"], label="Selecciona el tipo de tarea:")
            topic = gr.Textbox(label="Aspecto o Tema sobre el que trabajar", placeholder="Ingrese el tema aquí...")
            # NOTE(review): SerpAPI's 'tbs' parameter expects values like
            # "qdr:m"/"qdr:y"; these "1M"/"3M"/... choices are forwarded
            # verbatim and may be ignored by the API — confirm.
            integration_period = gr.Dropdown(choices=["1M", "3M", "6M", "1Y"], label="Periodo de integración de información")
            num_results = gr.Number(label="Número de resultados sobre los que trabajar", value=10)
            submit_button = gr.Button("Submit")
        with gr.Column():
            # Output panes: per-batch reports and the final compilation.
            output_text_intermedio = gr.Textbox(label="Resultados Intermedios", interactive=True, lines=10)
            output_text_final = gr.Textbox(label="Resultados Compilados", interactive=True, lines=10)

    # Wire the Submit button to the pipeline entry point.
    submit_button.click(
        fn=process_inputs,
        inputs=[task_type, topic, integration_period, num_results],
        outputs=[output_text_intermedio,output_text_final]
    )

if __name__ == "__main__":
    app.launch()