Spaces:
Sleeping
Sleeping
## Brochure app: takes a website URL, collects the links on its landing page, scrapes the relevant pages, and generates a brochure from the combined content.
from bs4 import BeautifulSoup | |
import requests | |
import json | |
import gradio as gr | |
from langchain_groq import ChatGroq | |
import os | |
# A class to represent a Webpage | |
class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Attributes set by the constructor:
        url: The address that was fetched.
        body: Raw response bytes.
        title: Text of the <title> tag, or "No title found".
        text: Visible text of <body> with scripts, styles, images and inputs
            removed; empty string when the page has no <body>.
        links: Every non-empty href found on an <a> tag, exactly as written
            in the page (so entries may be relative).
    """

    def __init__(self, url, timeout=10):
        """Fetch *url* and parse its title, body text and links.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before requests gives up.
        """
        self.url = url
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(url, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # .string is None when <title> is empty or contains nested tags, so
        # fall back to the placeholder instead of storing None.
        raw_title = soup.title.string if soup.title else None
        self.title = raw_title or "No title found"

        if soup.body:
            # Drop elements that carry no readable text.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and body text formatted for inclusion in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
## Check whether the links are relevant or not | |
def get_links_system_prompt(website):
    """Build the system prompt that asks the LLM to pick brochure-worthy links.

    Args:
        website: Not used by this function; accepted so existing callers
            keep working.

    Returns:
        The instruction text followed by a JSON example of the expected reply.
    """
    link_system_prompt = (
        "You are provided with a list of links found on a webpage. "
        "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
        "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    )
    link_system_prompt += "You should respond in JSON as in this example, do not say anything else:"
    # The example must itself be valid JSON: the reply is fed to json.loads,
    # and a malformed example invites a malformed answer.
    link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
    return link_system_prompt
def get_links_user_prompt(website):
    """Build the user prompt listing every link found on *website*.

    Args:
        website: Object exposing ``url`` (str) and ``links`` (iterable of str).

    Returns:
        The instructions followed by the links, one per line.
    """
    pieces = [
        f"Here is the list of links on the website of {website.url} - ",
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n",
        "Links (some might be relative links):\n",
        "\n".join(website.links),
    ]
    return "".join(pieces)
def _extract_json_object(text):
    """Return the part of *text* between its first '{' and last '}' (inclusive)."""
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        return text
    return text[start:end + 1]


def get_relevant_links(url, llm):
    """Ask the LLM which links on the page at *url* belong in a brochure.

    Args:
        url: Address of the page whose links are to be classified.
        llm: Chat model exposing ``invoke(messages)`` whose result has a
            ``content`` string.

    Returns:
        A dict with a "links" list, as decoded from the model's JSON reply.

    Raises:
        json.JSONDecodeError: If no JSON can be recovered from the reply.
        ValueError: If the reply decodes to something other than an object
            or a list.
    """
    website = Website(url)
    messages = [
        ("system", get_links_system_prompt(website)),
        ("human", get_links_user_prompt(website)),
    ]
    raw = llm.invoke(messages).content
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # Chat models frequently wrap JSON in ``` fences or add a sentence
        # around it; retry on just the outermost {...} span.
        parsed = json.loads(_extract_json_object(raw))

    if isinstance(parsed, list):
        # Tolerate a bare list of link entries.
        return {"links": parsed}
    if not isinstance(parsed, dict):
        raise ValueError(f"Expected a JSON object with a 'links' key, got: {parsed!r}")
    # Callers index result["links"]; make sure the key is always present.
    parsed.setdefault("links", [])
    return parsed
def get_all_details(url, llm):
    """Collect the text of the landing page and of every relevant linked page.

    Args:
        url: Address of the company's landing page.
        llm: Chat model passed through to get_relevant_links.

    Returns:
        One string: the landing page contents followed by a titled section
        for each linked page that could be fetched.
    """
    from urllib.parse import urljoin

    sections = ["Landing page:\n", Website(url).get_contents()]
    links = get_relevant_links(url, llm)
    print("Found links:", links)
    for link in links.get("links") or []:
        if not isinstance(link, dict) or not link.get("url"):
            continue  # malformed entry from the model; nothing to fetch
        # The model is asked for absolute URLs but may echo a relative href;
        # urljoin leaves absolute URLs untouched.
        target = urljoin(url, link["url"])
        try:
            page = Website(target)
        except requests.RequestException as exc:
            # One unreachable page should not discard everything gathered so far.
            print(f"Skipping {target}: {exc}")
            continue
        sections.append(f"\n\n{link.get('type', 'page')}\n")
        sections.append(page.get_contents())
    return "".join(sections)
def get_brochure_system_prompt():
    """Return the system prompt that tells the LLM how to write the brochure."""
    # Adjacent literals are joined with explicit trailing spaces so that no
    # two sentences run together.
    system_prompt = (
        "You are an assistant that analyzes the contents of several relevant pages from a company website "
        "and creates a short brochure about the company for prospective customers, investors and recruits. "
        "Respond in markdown. "
        "Include details of company culture, customers and careers/jobs if you have the information."
    )
    return system_prompt
def get_brochure_user_prompt(company_name, url, llm, max_chars=20_000):
    """Build the user prompt containing the scraped pages for the brochure.

    Args:
        company_name: Name of the company, inserted into the prompt.
        url: Address of the company's landing page.
        llm: Chat model used to select which linked pages to include.
        max_chars: Upper bound on the prompt length; anything beyond it is
            cut off.

    Returns:
        The prompt text, at most *max_chars* characters long.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url, llm)
    # Truncate so that very large sites do not produce an oversized prompt.
    return user_prompt[:max_chars]
def create_brochure(company_name, url, llm_choice, groq_api_key):
    """Generate a markdown brochure for a company from its website.

    Args:
        company_name: Name of the company to describe.
        url: Address of the company's landing page; "https://" is assumed
            when no scheme is given.
        llm_choice: Key of ``llm_options`` selecting the Groq model.
        groq_api_key: Groq API key used for this request only.

    Returns:
        The brochure as a markdown string.

    Raises:
        gr.Error: If a required input is missing or the model choice is
            unknown; Gradio shows the message to the user.
    """
    if llm_choice not in llm_options:
        raise gr.Error("Please select an LLM.")
    if not groq_api_key:
        raise gr.Error("Please provide your Groq API key.")
    url = (url or "").strip()
    if not url:
        raise gr.Error("Please enter a website URL.")
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    # Hand the key to the client directly. Writing it to os.environ would
    # expose it process-wide and let concurrent requests overwrite each
    # other's keys.
    llm = ChatGroq(model=llm_options[llm_choice], groq_api_key=groq_api_key)
    messages = [
        ("system", get_brochure_system_prompt()),
        ("human", get_brochure_user_prompt(company_name, url, llm)),
    ]
    # Single blocking call; the full response text is returned at once.
    return llm.invoke(messages).content
# Display name shown in the UI -> Groq model identifier passed to ChatGroq.
llm_options = {
    "Gemma2": "gemma2-9b-it",
    "LLama": "llama-3.2-3b-preview",
    "Mixtral": "mixtral-8x7b-32768",
}
# Gradio interface | |
demo = gr.Interface(
    fn=create_brochure,
    inputs=[
        gr.Textbox(label="Company name"),  # Company name input
        gr.Textbox(label="Website URL"),  # URL input
        gr.Dropdown(
            choices=list(llm_options),
            # Preselect the first model so the callback never receives None.
            value=next(iter(llm_options)),
            label="Select LLM",
        ),  # LLM selection
        gr.Textbox(type="password", label="Enter Groq API Key"),  # API Key input
    ],
    outputs="markdown",  # Output format
    title="Brochure Generator",
    description="Generate brochures by selecting a company name, URL, and LLM. Provide your Groq API Key.",
)
demo.launch()