Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os.path
|
2 |
+
import openai
|
3 |
+
from openai import OpenAI
|
4 |
+
import os
|
5 |
+
from groq import Groq
|
6 |
+
import requests
|
7 |
+
import time
|
8 |
+
from html.parser import HTMLParser
|
9 |
+
from bs4 import BeautifulSoup
|
10 |
+
import json
|
11 |
+
from datetime import datetime
|
12 |
+
import pandas as pd
|
13 |
+
from serpapi import GoogleSearch
|
14 |
+
import gradio as gr
|
# --- API client configuration (keys are read from the environment) ---
# BUG FIX: the original called bare `getenv(...)`, which is undefined
# (only `os` is imported), so the module raised NameError on import.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
client_groq = Groq(api_key=GROQ_API_KEY)

openai_key = os.getenv("OPENAI_API_KEY")
# OpenAI() picks the key up from the environment, so mirror it there.
os.environ["OPENAI_API_KEY"] = openai_key
client = OpenAI()

SERPAPI_KEY = os.getenv("SERPAPI_KEY")

api_key = os.getenv("API_KEY")
def scrape_website(url):
    """Fetch *url* and return the concatenated text of its <p> tags.

    On any failure this returns a human-readable error string rather than
    raising, so callers can store the message in place of the content.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=20)
        # Set the decoded encoding to the detected one. FIX: the original
        # then parsed `response.content` (raw bytes), which ignores this
        # setting entirely; `response.text` below actually honours it.
        response.encoding = response.apparent_encoding

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            paragraphs = soup.find_all('p')
            # One paragraph per line, content only (no title/URL here).
            return "\n".join(p.get_text() for p in paragraphs)
        else:
            return "Failed to retrieve the webpage (Status Code: {})".format(response.status_code)

    except requests.exceptions.ReadTimeout:
        # Request exceeded the 20-second timeout above.
        return "Request timed out after 20 seconds."
    except requests.exceptions.SSLError as e:
        return "Request Error: {}".format(e)
    except requests.exceptions.RequestException as e:
        # Any other requests-related failure (DNS, connection, ...).
        return "An error occurred: {}".format(e)
def update_dataframe_with_results(organic_results):
    """Build a DataFrame from SerpAPI organic results, one row per hit,
    with the scraped page text attached in a "Scraped Content" column.
    """
    max_chars = 100000  # cap per-cell content size
    rows = []
    for item in organic_results:
        # Fetch the page body for this hit (may be an error-message string).
        content = scrape_website(item.get('link'))
        # Truncate oversized pages to keep cells manageable.
        if len(content) > max_chars:
            content = content[:max_chars]
        rows.append({
            "Title": item.get('title'),
            "Link": item.get('link'),
            "Snippet": item.get('snippet'),
            "Displayed Link": item.get('displayed_link'),
            "Date": item.get('date'),  # not always present in results
            "Rich Snippet": item.get('rich_snippet'),  # not always present
            "Scraped Content": content,
        })
    return pd.DataFrame(rows)
def opencall(text, user_query):
    """Ask the chat model to turn one Information Pool chunk into a
    three-perspective report on *user_query*.

    The reply is suffixed with the 'XXXXX' batch separator so downstream
    code can split concatenated reports back apart.
    """
    print("Calling opencall function with", len(text), "characters")

    system_prompts = [
        "You are a helpful assistant, specialised in preparing contents for preparing a presentation.",
        "Your task is to prepare a base report on the topics, themes and trends addressed in the latest conferences, seminars and symposiums.",
        "For this matter I will be providing you in the Information Pool a compilation of several scraped google search results from the latest conferences, seminars and symposiums on the topic: " + user_query,
        "Each piece of Scraped Content start with the tag '### Title:' indicating the title, followed by the URL reference '### Link:' , followed by the contents '### Content:'",
        "Process all the information in the Information Pool to provide:",
        "1) Perspective of Relevant Information: Assess and extract the most relevant information from the point of view of this aspect: " + user_query + ".",
        "2) Perspective of Key Topics: Highlight the key topics and themes.Cite the URLs that source those topics and themes",
        "3) Perspective of Emergent Trends: Highlight the emergent trends.Cite the URLs that source those trends.",
        "In the response, use the indicated structure of 1)Perspective of Relevant Information 2)Perspective of Key Topics 3)Perspective of Emergent Trends",
    ]
    messages = [{"role": "system", "content": prompt} for prompt in system_prompts]
    messages.append({"role": "user", "content": "Information Pool:" + text})

    completion = client.chat.completions.create(
        model="gpt-4-0125-preview",
        temperature=0.1,
        messages=messages,
    )
    reply = completion.choices[0].message.content
    # Terminate the batch with the separator rearrange_text() splits on.
    return reply + "\n" + "XXXXX" + "\n"
def split_large_content(content, max_length=30000):
    """Split *content* into segments of at most *max_length* characters.

    The leading title/source header — everything up to and including the
    first blank line ('\\n\\n') — is repeated at the start of every segment
    so each chunk remains self-describing for the LLM.

    Splitting prefers word boundaries (last space in the slice).
    Returns a list of header-prefixed segment strings.
    """
    # Header ends just after the first blank line; if '\n\n' is absent,
    # find() returns -1 and the header degenerates to the first character.
    header_end = content.find('\n\n') + 2
    header = content[:header_end]

    # Reserve room for the header inside every segment.
    max_segment_length = max_length - len(header)

    segments = []
    body = content[header_end:]

    while len(body) > 0:
        segment = body[:max_segment_length]

        # Back-track to the last complete word, but only when that leaves
        # a non-empty segment. BUG FIX: the original used segment[:last_space]
        # unconditionally; when the only space sat at index 0 the segment
        # became empty, `body` never shrank, and the loop ran forever.
        if len(body) > max_segment_length:
            last_space = segment.rfind(' ')
            if last_space > 0:
                segment = segment[:last_space]

        segments.append(header + segment)

        # Advance past exactly what was consumed (no characters are lost).
        body = body[len(segment):]

    return segments
def main(df,google_search_query):
    """Batch the scraped rows of *df* into ~30k-character Information
    Pools, send each pool to the LLM via opencall(), append every
    response to 'respuestas.txt', and return all responses concatenated.

    Batching strategy:
      - rows longer than 30k chars are split with split_large_content()
        and each segment is sent on its own;
      - shorter rows are accumulated into `information_pool` until adding
        one would exceed 30k, at which point the pool is flushed to the
        LLM and restarted with the current row;
      - any remainder is flushed after the loop.
    """
    # Accumulator for the current (not yet sent) Information Pool.
    information_pool = ""
    # Concatenation of every LLM response; this is the return value.
    archivo1=""
    # Open or create a plain text file in append mode
    with open('respuestas.txt', mode='a', encoding='utf-8') as file:

        # Iterate over the rows of the DataFrame
        for index, row in df.iterrows():
            document_name = row['Title'] # Using title as document_name
            # NOTE(review): raw_content and link are assigned but unused
            # below — processed_content rebuilds from `row` directly.
            raw_content = str(row['Scraped Content']) # Convert to string to ensure compatibility
            link = row['Link'] # Retrieve link for additional usage or logging

            # Tagged block format expected by opencall()'s prompt
            # ('### Title:' / '### Link:' / '### Content:').
            processed_content = "### Title: " + row['Title'] + "\n" + "### Link: " + row['Link'] + "\n" + "### Content: " + str(row['Scraped Content']) + "\n" + "\n"

            print(document_name, ":", len(processed_content))
            #print("Contenido:", processed_content)
            print("acumulado:", len(information_pool + processed_content))

            # Case C: single row too large for one call — split and send
            # each segment independently.
            if len(processed_content) > 30000:
                content_segments = split_large_content(processed_content)
                for segment in content_segments:
                    print("EN C, Nuevo valor de Text:", len(segment))
                    #print("segmen:",segment)
                    response = opencall(segment,google_search_query)
                    archivo1=archivo1+response+'\n'
                    file.write(response + '\n')

            else:
                # Case A: row still fits in the current pool — accumulate.
                if len(information_pool + processed_content) <= 30000:
                    information_pool += processed_content
                    print("EN A, Nuevo valor de Text:", len(information_pool))
                else:
                    # Case B: pool would overflow — flush it to the LLM,
                    # then start a fresh pool with the current row.
                    print("EN B1, llamando con valor de Text:", len(information_pool))
                    #print("Information pool", information_pool)
                    response = opencall(information_pool,google_search_query)
                    file.write(response + '\n')
                    archivo1=archivo1+response+'\n'
                    information_pool = processed_content
                    print("EN B2, nuevo valor de Text:", len(information_pool), " Con documento:", document_name)

        # Flush whatever remains accumulated after the last row.
        if information_pool:
            print("Final call")
            response = opencall(information_pool,google_search_query)
            file.write(response + '\n')
            archivo1=archivo1+response+'\n'
    return archivo1
def rearrange_text(text):
    """Split the concatenated opencall() responses (batches separated by
    'XXXXX') into their three report sections and regroup them by section.

    Returns a tuple (all_texta, all_textb, all_textc): the concatenated
    'Relevant Information', 'Key Topics' and 'Emergent Trends' sections
    across all batches, with the section headings stripped.
    """
    # Each opencall() reply ends with 'XXXXX', so split on it.
    batches = text.split('XXXXX')

    # Accumulators for the three regrouped sections.
    all_texta = ""
    all_textb = ""
    all_textc = ""

    # Section headings as mandated by opencall()'s response structure.
    # BUG FIX: the original looked for "Perspective of Key Emerging
    # Aspects" / "Perspective of Key Entities", which opencall() never
    # emits — find() returned -1 and sections b/c collapsed to the last
    # character of each batch.
    markers = {
        'texta_marker': "Perspective of Relevant Information",
        'textb_marker': "Perspective of Key Topics",
        'textc_marker': "Perspective of Emergent Trends"
    }

    for batch in batches:
        texta_start = batch.find(markers['texta_marker'])
        textb_start = batch.find(markers['textb_marker'])
        textc_start = batch.find(markers['textc_marker'])

        # A missing marker yields an empty section instead of a stray
        # negative-index slice.
        if texta_start != -1:
            texta = batch[texta_start:textb_start] if textb_start != -1 else batch[texta_start:]
        else:
            texta = ""
        if textb_start != -1:
            textb = batch[textb_start:textc_start] if textc_start != -1 else batch[textb_start:]
        else:
            textb = ""
        textc = batch[textc_start:] if textc_start != -1 else ""

        # Strip the heading itself from each extracted section.
        texta = texta.replace(markers['texta_marker'], '').strip()
        textb = textb.replace(markers['textb_marker'], '').strip()
        textc = textc.replace(markers['textc_marker'], '').strip()

        # Join sections from successive batches with newlines.
        all_texta += "\n" + texta if all_texta else texta
        all_textb += "\n" + textb if all_textb else textb
        all_textc += "\n" + textc if all_textc else textc

    return all_texta, all_textb, all_textc
def resumen(text):
    """Compile the final report from the accumulated batch output *text*.

    Splits *text* into its three perspectives with rearrange_text(), asks
    the chat model to write one integrated ~2000-word narrative for each,
    and returns the three labelled narratives concatenated.

    NOTE(review): rearrange_text()'s section markers and the labels used
    here ("Key emerging aspects" / "Key Entities") do not match the
    section names opencall() is instructed to emit — confirm which naming
    is intended before relying on sections b/c.
    """
    texta, textb, textc = rearrange_text(text)

    # 1) Integrated narrative for the "Relevant Information" section.
    completion = client.chat.completions.create(model="gpt-4-0125-preview",temperature=0.5, messages=[
        {"role": "system", "content": "You are a helpful assistant, specialised in composing and integrating information."},
        {"role": "system", "content": "Your task is to provide an integrated comprehensive 2000 words narrative of the different points indicated in the Information Pool text for a internal report on recent news." },
        {"role": "system", "content": "Instructions. Elaborate the text following these rules:" },
        {"role": "system", "content": "Be exhaustive, comprehensive and detailed in addressing the relation of different points indicated in the Information Pool text." },
        {"role": "system", "content": "Arrange paragraphs and information around each entity or related entities and concepts, integrating them with a fluent narrative." },
        {"role": "system", "content": "Start directly with the narrative, do not introduce the text, as it is part of a broader report." },
        {"role": "system", "content": "Use a formal writing style, yet plain and easy to read. Avoid pomposity and making up artificial descriptions. The audience is well acquainted with technical and defence/military vocabulary, information and entities. " },

        {"role": "user", "content":"Information Pool:"+texta} ] )

    # Guard against a missing message object in the API response.
    response1 = completion.choices[0].message.content if completion.choices[0].message else ""
    response_1="1) Perspective of Relevant Information:"+"\n"+response1+"\n"

    # 2) Integrated relation for the "emerging aspects" section.
    completion = client.chat.completions.create(model="gpt-4-0125-preview",temperature=0.5, messages=[
        {"role": "system", "content": "You are a helpful assistant, specialised in composing and integrating information."},
        {"role": "system", "content": "Your task is to provide a comprehensive and integrated relation of about 2000 words in length of the different emerging aspects indicated in the Information Pool text for a internal report on recent news." },
        {"role": "system", "content": "Instructions. Elaborate the text following these rules:" },
        {"role": "system", "content": "Be exhaustive, comprehensive and detailed in the relation." },
        {"role": "system", "content": "Arrange paragraphs and information around each entity or related entities and concepts." },
        {"role": "system", "content": "Start directly with the relation, do not introduce the text, as it is part of a broader report." },
        {"role": "system", "content": "Use a formal writing style, yet plain and easy to read. The audience is well acquainted with technical and defence/military vocabulary, information and entities. " },

        {"role": "user", "content":"Information Pool:"+textb} ] )
    response2 = completion.choices[0].message.content if completion.choices[0].message else ""
    response_2=" 2)Perspective of Key emerging aspects:"+"\n"+response2+"\n"

    # 3) Sorted/consolidated relation of entities.
    # NOTE(review): "sore" in the prompt below looks like a typo for
    # "sort" — it is a runtime string, so left untouched; confirm intent.
    completion = client.chat.completions.create(model="gpt-4-0125-preview",temperature=0.5, messages=[
        {"role": "system", "content": "You are a helpful assistant, specialised in composing and integrating information."},
        {"role": "system", "content": "Your task is to consolidate and sore the relation of the different entities indicated in the Information Pool text for a internal report on recent news." },
        {"role": "system", "content": "Instructions. Elaborate the text following these rules:" },
        {"role": "system", "content": "Be exhaustive in the sorting. Sort around similar entry types: Organization, Program, Technology, Entity, ... You can merge similar entry types (i.e. Technologies and Technology Terms and Concepts, People and Officials,...)" },
        {"role": "system", "content": "Arrange and integrate entries around similar or related concepts. Discard duplicated concepts or elements." },
        {"role": "system", "content": "Start directly with the relation, do not introduce the text, as it is part of a broader report." },
        {"role": "system", "content": "The audience is well acquainted with technical and defence/military vocabulary, information and entities. " },

        {"role": "user", "content":"Information Pool:"+textc} ] )

    response3 = completion.choices[0].message.content if completion.choices[0].message else ""
    response_3=" 3)Perspective of of Key Entities"+"\n"+response3+"\n"
    # Final compilation returned to the UI (also echoed to stdout).
    compilacion=response_1+"\n"+response_2+"\n"+response_3
    print(compilacion)
    print("\n\n")
    print("\n\n")
    return compilacion
# Fetch Google organic search results through SerpAPI.
def get_organic_results(query, periodo_tbs, num_results):
    """Run *query* through SerpAPI's Google engine and return the list of
    organic results (empty list when none are present).

    periodo_tbs is forwarded verbatim as the 'tbs' time-window filter.
    """
    params = {
        "q": query,
        "num": str(num_results),
        "tbs": periodo_tbs,  # time-window filter for the results
        "api_key": SERPAPI_KEY,
    }
    results = GoogleSearch(params).get_dict()
    organic_results = results.get("organic_results", [])

    # Echo the titles so progress is visible in the console.
    for entry in organic_results:
        print("Title:", entry.get('title'))
        print()

    return organic_results
def process_inputs(task_type, topic, integration_period, num_results):
    """Gradio callback: run the full search → scrape → report pipeline.

    Returns a pair (intermediate per-batch reports, final compiled
    summary) for the two output textboxes. task_type is currently unused.
    """
    # Quote the topic literally and broaden to event-style pages.
    google_search_query = f'"{topic}" Conferences OR seminars OR SYMPOSIUMS'

    hits = get_organic_results(google_search_query, integration_period, int(num_results))
    frame = update_dataframe_with_results(hits)
    intermediate = main(frame, google_search_query)
    final_summary = resumen(intermediate)

    return intermediate, final_summary
# Create the Gradio blocks interface (two-column layout: inputs | outputs).
with gr.Blocks() as app:
    with gr.Row():
        with gr.Column():
            # Input controls (labels in Spanish for the app's audience).
            task_type = gr.Dropdown(choices=["Conferencias", "Seminarios", "Simposios"], label="Selecciona el tipo de tarea:")
            topic = gr.Textbox(label="Aspecto o Tema sobre el que trabajar", placeholder="Ingrese el tema aquí...")
            # NOTE(review): SerpAPI's 'tbs' parameter expects values like
            # "qdr:m"/"qdr:y"; these "1M"/"3M"/... choices are forwarded
            # verbatim and may be ignored by the API — confirm.
            integration_period = gr.Dropdown(choices=["1M", "3M", "6M", "1Y"], label="Periodo de integración de información")
            num_results = gr.Number(label="Número de resultados sobre los que trabajar", value=10)
            submit_button = gr.Button("Submit")
        with gr.Column():
            # Output panes: per-batch reports and the final compilation.
            output_text_intermedio = gr.Textbox(label="Resultados Intermedios", interactive=True, lines=10)
            output_text_final = gr.Textbox(label="Resultados Compilados", interactive=True, lines=10)

    # Wire the Submit button to the pipeline entry point.
    submit_button.click(
        fn=process_inputs,
        inputs=[task_type, topic, integration_period, num_results],
        outputs=[output_text_intermedio,output_text_final]
    )

if __name__ == "__main__":
    app.launch()