LAPINILLA committed on
Commit
018fbde
1 Parent(s): 0a4f5bb

Create app.py

Files changed (1)
  1. app.py +340 -0
app.py ADDED
@@ -0,0 +1,340 @@
import os.path
import openai
from openai import OpenAI
import os
from groq import Groq
import requests
import time
from html.parser import HTMLParser
from bs4 import BeautifulSoup
import json
from datetime import datetime
import pandas as pd
from serpapi import GoogleSearch
import gradio as gr

# API keys are read from environment variables
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
client_groq = Groq(api_key=GROQ_API_KEY)
openai_key = os.getenv("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = openai_key
client = OpenAI()

SERPAPI_KEY = os.getenv("SERPAPI_KEY")

api_key = os.getenv("API_KEY")

def scrape_website(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.encoding = response.apparent_encoding  # Set encoding to match the content

        if response.status_code == 200:
            page_content = response.content
            soup = BeautifulSoup(page_content, 'html.parser')
            paragraphs = soup.find_all('p')
            scraped_data = [p.get_text() for p in paragraphs]
            formatted_data = "\n".join(scraped_data)

            return formatted_data  # Return only the content
        else:
            return "Failed to retrieve the webpage (Status Code: {})".format(response.status_code)

    except requests.exceptions.ReadTimeout:
        # Handle the timeout exception
        return "Request timed out after 20 seconds."
    except requests.exceptions.SSLError as e:
        return "Request Error: {}".format(e)
    except requests.exceptions.RequestException as e:
        # Handle other requests-related exceptions
        return "An error occurred: {}".format(e)


def update_dataframe_with_results(organic_results):
    # Prepare data for the DataFrame
    max_chars = 100000  # Maximum characters allowed in a cell
    data = []
    for result in organic_results:
        # Scrape the website content
        scraped_content = scrape_website(result.get('link'))
        # Truncate the content if it exceeds the limit
        if len(scraped_content) > max_chars:
            scraped_content = scraped_content[:max_chars]
        data.append({
            "Title": result.get('title'),
            "Link": result.get('link'),
            "Snippet": result.get('snippet'),
            "Displayed Link": result.get('displayed_link'),
            "Date": result.get('date'),  # Might not always be present
            "Rich Snippet": result.get('rich_snippet'),  # Might not always be present
            "Scraped Content": scraped_content  # Add scraped content
        })

    df = pd.DataFrame(data)
    return df

def opencall(text, user_query):
    print("Calling opencall function with", len(text), "characters")
    # completion = client_groq.chat.completions.create(
    completion = client.chat.completions.create(
        model="gpt-4-0125-preview",
        # model="mixtral-8x7b-32768",
        temperature=0.1,
        messages=[
            {"role": "system", "content": "You are a helpful assistant, specialised in preparing content for a presentation."},
            {"role": "system", "content": "Your task is to prepare a base report on the topics, themes and trends addressed in the latest conferences, seminars and symposiums."},
            {"role": "system", "content": "For this purpose I will provide you, in the Information Pool, with a compilation of several scraped Google search results from the latest conferences, seminars and symposiums on the topic: "+user_query},
            {"role": "system", "content": "Each piece of Scraped Content starts with the tag '### Title:' indicating the title, followed by the URL reference '### Link:', followed by the contents '### Content:'"},
            {"role": "system", "content": "Process all the information in the Information Pool to provide:"},
            {"role": "system", "content": "1) Perspective of Relevant Information: Assess and extract the most relevant information from the point of view of this aspect: "+user_query+"."},
            {"role": "system", "content": "2) Perspective of Key Topics: Highlight the key topics and themes. Cite the URLs that source those topics and themes."},
            {"role": "system", "content": "3) Perspective of Emergent Trends: Highlight the emergent trends. Cite the URLs that source those trends."},
            {"role": "system", "content": "In the response, use the indicated structure of 1) Perspective of Relevant Information 2) Perspective of Key Topics 3) Perspective of Emergent Trends"},
            {"role": "user", "content": "Information Pool:"+text}
        ]
    )
    response = completion.choices[0].message.content
    # 'XXXXX' marks the end of each batch so later steps can split the accumulated output
    response = response + "\n" + "XXXXX" + "\n"
    return response

def split_large_content(content, max_length=30000):
    # Extract the title and source URL, assuming they end with the second newline
    title_and_source_end = content.find('\n\n') + 2
    title_and_source = content[:title_and_source_end]
    title_and_source_length = len(title_and_source)

    # Ensure each segment has space for the title and source by reducing max_length
    max_segment_length = max_length - title_and_source_length

    segments = []
    content_body = content[title_and_source_end:]

    # Start splitting the content_body into segments
    while len(content_body) > 0:
        # Take a slice of content up to max_segment_length
        segment = content_body[:max_segment_length]

        # If we're not at the end of content_body, back-track to the last complete word
        if len(content_body) > max_segment_length:
            last_space = segment.rfind(' ')
            segment = segment[:last_space]

        # Add the title and source URL to the start of this segment
        full_segment = title_and_source + segment
        segments.append(full_segment)

        # Move forward in content_body by the length of the segment just consumed
        content_body = content_body[len(segment):]

    return segments


def main(df, google_search_query):
    # Initialize a string to accumulate the information
    information_pool = ""
    archivo1 = ""
    # Open or create a plain text file in append mode
    with open('respuestas.txt', mode='a', encoding='utf-8') as file:

        # Iterate over the rows of the DataFrame
        for index, row in df.iterrows():
            # Combine title, link, and content into a single string
            document_name = row['Title']  # Using the title as document_name
            raw_content = str(row['Scraped Content'])  # Convert to string to ensure compatibility
            link = row['Link']  # Retrieve the link for additional usage or logging

            processed_content = "### Title: " + row['Title'] + "\n" + "### Link: " + row['Link'] + "\n" + "### Content: " + str(row['Scraped Content']) + "\n" + "\n"

            print(document_name, ":", len(processed_content))
            # print("Content:", processed_content)
            print("accumulated:", len(information_pool + processed_content))

            # Handle long content by splitting and processing in segments
            if len(processed_content) > 30000:
                content_segments = split_large_content(processed_content)
                for segment in content_segments:
                    print("IN C, new value of Text:", len(segment))
                    # print("segment:", segment)
                    response = opencall(segment, google_search_query)
                    archivo1 = archivo1 + response + '\n'
                    file.write(response + '\n')

            else:
                # Check if adding the processed content exceeds the size limit
                if len(information_pool + processed_content) <= 30000:
                    information_pool += processed_content
                    print("IN A, new value of Text:", len(information_pool))
                else:
                    # Process the currently accumulated content and start a new accumulation
                    print("IN B1, calling with value of Text:", len(information_pool))
                    # print("Information pool", information_pool)
                    response = opencall(information_pool, google_search_query)
                    file.write(response + '\n')
                    archivo1 = archivo1 + response + '\n'
                    information_pool = processed_content
                    print("IN B2, new value of Text:", len(information_pool), " With document:", document_name)

        # Handle any remaining content after the loop
        if information_pool:
            print("Final call")
            response = opencall(information_pool, google_search_query)
            file.write(response + '\n')
            archivo1 = archivo1 + response + '\n'
    return archivo1

def rearrange_text(text):
    # Split the text into batches using 'XXXXX'
    batches = text.split('XXXXX')

    # Initialize variables to store concatenated texts
    all_texta = ""
    all_textb = ""
    all_textc = ""

    # Define markers for the different sections
    # (these strings need to match the section headings that opencall asks the model to produce)
    markers = {
        'texta_marker': "Perspective of Relevant Information",
        'textb_marker': "Perspective of Key Emerging Aspects",
        'textc_marker': "Perspective of Key Entities"
    }

    # Process each batch
    for batch in batches:
        # Locate the start index of each section
        texta_start = batch.find(markers['texta_marker'])
        textb_start = batch.find(markers['textb_marker'])
        textc_start = batch.find(markers['textc_marker'])

        # Extract TEXTA, TEXTB, and TEXTC using the found indices;
        # if a marker is not found, take the text up to the end of the batch
        texta = batch[texta_start:textb_start] if textb_start != -1 else batch[texta_start:]
        textb = batch[textb_start:textc_start] if textc_start != -1 else batch[textb_start:]
        textc = batch[textc_start:]

        # Remove the markers from the beginning of each text
        texta = texta.replace(markers['texta_marker'], '').strip()
        textb = textb.replace(markers['textb_marker'], '').strip()
        textc = textc.replace(markers['textc_marker'], '').strip()

        # Concatenate texts from all batches
        all_texta += "\n" + texta if all_texta else texta
        all_textb += "\n" + textb if all_textb else textb
        all_textc += "\n" + textc if all_textc else textc

    # all_texta, all_textb, and all_textc can now be used for further summarization or processing
    return all_texta, all_textb, all_textc

def resumen(text):
    texta, textb, textc = rearrange_text(text)

    completion = client.chat.completions.create(model="gpt-4-0125-preview", temperature=0.5, messages=[
        {"role": "system", "content": "You are a helpful assistant, specialised in composing and integrating information."},
        {"role": "system", "content": "Your task is to provide an integrated, comprehensive 2000-word narrative of the different points indicated in the Information Pool text for an internal report on recent news."},
        {"role": "system", "content": "Instructions. Elaborate the text following these rules:"},
        {"role": "system", "content": "Be exhaustive, comprehensive and detailed in addressing the relation of the different points indicated in the Information Pool text."},
        {"role": "system", "content": "Arrange paragraphs and information around each entity or related entities and concepts, integrating them with a fluent narrative."},
        {"role": "system", "content": "Start directly with the narrative, do not introduce the text, as it is part of a broader report."},
        {"role": "system", "content": "Use a formal writing style, yet plain and easy to read. Avoid pomposity and making up artificial descriptions. The audience is well acquainted with technical and defence/military vocabulary, information and entities."},
        {"role": "user", "content": "Information Pool:"+texta}
    ])

    response1 = completion.choices[0].message.content if completion.choices[0].message else ""
    response_1 = "1) Perspective of Relevant Information:"+"\n"+response1+"\n"

    completion = client.chat.completions.create(model="gpt-4-0125-preview", temperature=0.5, messages=[
        {"role": "system", "content": "You are a helpful assistant, specialised in composing and integrating information."},
        {"role": "system", "content": "Your task is to provide a comprehensive and integrated relation of about 2000 words in length of the different emerging aspects indicated in the Information Pool text for an internal report on recent news."},
        {"role": "system", "content": "Instructions. Elaborate the text following these rules:"},
        {"role": "system", "content": "Be exhaustive, comprehensive and detailed in the relation."},
        {"role": "system", "content": "Arrange paragraphs and information around each entity or related entities and concepts."},
        {"role": "system", "content": "Start directly with the relation, do not introduce the text, as it is part of a broader report."},
        {"role": "system", "content": "Use a formal writing style, yet plain and easy to read. The audience is well acquainted with technical and defence/military vocabulary, information and entities."},
        {"role": "user", "content": "Information Pool:"+textb}
    ])
    response2 = completion.choices[0].message.content if completion.choices[0].message else ""
    response_2 = "2) Perspective of Key Emerging Aspects:"+"\n"+response2+"\n"

    completion = client.chat.completions.create(model="gpt-4-0125-preview", temperature=0.5, messages=[
        {"role": "system", "content": "You are a helpful assistant, specialised in composing and integrating information."},
        {"role": "system", "content": "Your task is to consolidate and sort the relation of the different entities indicated in the Information Pool text for an internal report on recent news."},
        {"role": "system", "content": "Instructions. Elaborate the text following these rules:"},
        {"role": "system", "content": "Be exhaustive in the sorting. Sort around similar entry types: Organization, Program, Technology, Entity, ... You can merge similar entry types (i.e. Technologies and Technology Terms and Concepts, People and Officials, ...)"},
        {"role": "system", "content": "Arrange and integrate entries around similar or related concepts. Discard duplicated concepts or elements."},
        {"role": "system", "content": "Start directly with the relation, do not introduce the text, as it is part of a broader report."},
        {"role": "system", "content": "The audience is well acquainted with technical and defence/military vocabulary, information and entities."},
        {"role": "user", "content": "Information Pool:"+textc}
    ])

    response3 = completion.choices[0].message.content if completion.choices[0].message else ""
    response_3 = "3) Perspective of Key Entities:"+"\n"+response3+"\n"
    compilacion = response_1+"\n"+response_2+"\n"+response_3
    print(compilacion)
    print("\n\n")
    print("\n\n")
    return compilacion

# Define the function to get organic search results
def get_organic_results(query, periodo_tbs, num_results):
    params = {
        "q": query,
        "num": str(num_results),
        "tbs": periodo_tbs,  # time-range filter for the results (e.g. the last year)
        "api_key": SERPAPI_KEY
    }
    search = GoogleSearch(params)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])  # Use "organic_results" rather than "news_results"

    for result in organic_results:
        title = result.get('title')
        print("Title:", title)
        print()  # Print a newline for better readability between results

    return organic_results


def process_inputs(task_type, topic, integration_period, num_results):
    # Construct the query based on user input
    google_search_query = f'"{topic}" Conferences OR seminars OR SYMPOSIUMS'
    # Note: SerpApi's "tbs" parameter expects Google's time-filter syntax (e.g. "qdr:m", "qdr:y"),
    # so the dropdown values ("1M", "3M", ...) may need to be mapped to that format.
    periodo_tbs = integration_period
    num_resultados = int(num_results)

    # Fetch results based on the user's query
    results = get_organic_results(google_search_query, periodo_tbs, num_resultados)
    df = update_dataframe_with_results(results)
    archivo1 = main(df, google_search_query)
    resumen_text = resumen(archivo1)

    return archivo1, resumen_text

# Create the Gradio Blocks interface
with gr.Blocks() as app:
    with gr.Row():
        with gr.Column():
            task_type = gr.Dropdown(choices=["Conferencias", "Seminarios", "Simposios"], label="Selecciona el tipo de tarea:")
            topic = gr.Textbox(label="Aspecto o Tema sobre el que trabajar", placeholder="Ingrese el tema aquí...")
            integration_period = gr.Dropdown(choices=["1M", "3M", "6M", "1Y"], label="Periodo de integración de información")
            num_results = gr.Number(label="Número de resultados sobre los que trabajar", value=10)
            submit_button = gr.Button("Submit")
        with gr.Column():
            output_text_intermedio = gr.Textbox(label="Resultados Intermedios", interactive=True, lines=10)
            output_text_final = gr.Textbox(label="Resultados Compilados", interactive=True, lines=10)

    # Define what happens when the Submit button is clicked
    submit_button.click(
        fn=process_inputs,
        inputs=[task_type, topic, integration_period, num_results],
        outputs=[output_text_intermedio, output_text_final]
    )

if __name__ == "__main__":
    app.launch()
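
For running the app locally, a minimal pre-launch sketch of what the script assumes is shown below. The environment variable names come from the top of app.py; the PyPI package names (for instance google-search-results for the serpapi import) are the usual distributions for these imports and should be verified against your own environment.

```python
# Minimal pre-launch sketch (assumed install:
#   pip install openai groq requests beautifulsoup4 pandas google-search-results gradio)
import os

# Environment variables read at the top of app.py
required_vars = ("GROQ_API_KEY", "OPENAI_API_KEY", "SERPAPI_KEY")

missing = [var for var in required_vars if not os.getenv(var)]
if missing:
    raise RuntimeError("Missing required environment variables: " + ", ".join(missing))
```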