Spaces:
Sleeping
Sleeping
File size: 4,850 Bytes
32063f6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
## Brochure app: given a company website, pick out its relevant links, fetch each linked page, and generate a brochure from the combined content
import json
import os
from urllib.parse import urljoin

import gradio as gr
import requests
from bs4 import BeautifulSoup
from langchain_groq import ChatGroq
# A class to represent a Webpage
class Website:
    """
    A scraped web page: its title, visible text and outgoing links.

    The constructor performs the HTTP request and parses the response, so
    creating an instance does network I/O.
    """

    def __init__(self, url, timeout=10):
        """
        Download ``url`` and extract its title, text and links.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        self.url = url
        # Without an explicit timeout a stalled server would block forever.
        response = requests.get(url, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # ``soup.title.string`` is None when <title> is empty or contains
        # nested tags; fall back so the literal "None" never reaches a prompt.
        title = soup.title.string if soup.title else None
        self.title = title if title else "No title found"

        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Keep only anchors that actually have a non-empty href.
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        """Return the page title and text formatted for inclusion in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
## Check whether the links are relevant or not
def get_links_system_prompt(website):
    """
    Build the system prompt that asks the model to pick brochure-worthy links.

    Args:
        website: Accepted for symmetry with ``get_links_user_prompt``; it is
            not used when building the prompt.

    Returns:
        The prompt text, ending with a JSON example of the expected reply.
    """
    instructions = (
        "You are provided with a list of links found on a webpage. "
        "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
        "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
        "You should respond in JSON as in this example, do not say anything else:"
    )
    # The example must itself be valid JSON: the model imitates it, and the
    # reply is later parsed as JSON. The second entry previously had a ':'
    # where a ',' belongs between the "type" value and the "url" key.
    example = (
        "\n"
        "{\n"
        '"links": [\n'
        '{"type": "about page", "url": "https://full.url/goes/here/about"},\n'
        '{"type": "careers page", "url": "https://another.full.url/careers"}\n'
        "]\n"
        "}\n"
    )
    return instructions + example
def get_links_user_prompt(website):
    """
    Build the user prompt listing every link found on ``website``.

    Args:
        website: Object exposing ``url`` and ``links`` (an iterable of
            strings), such as a ``Website`` instance.

    Returns:
        The prompt text: an instruction paragraph followed by one link per line.
    """
    intro = f"Here is the list of links on the website of {website.url} - "
    request = (
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    heading = "Links (some might be relative links):\n"
    link_lines = "\n".join(website.links)
    return intro + request + heading + link_lines
def _parse_links_response(text):
    """Extract the ``{"links": [...]}`` object from raw model output, tolerating extra text."""
    if not isinstance(text, str):
        return {"links": []}
    # Models often wrap JSON in markdown fences or add a sentence around it,
    # so parse only the outermost {...} span.
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        return {"links": []}
    try:
        parsed = json.loads(text[start:end + 1])
    except json.JSONDecodeError:
        return {"links": []}
    if not isinstance(parsed, dict) or not isinstance(parsed.get("links"), list):
        return {"links": []}
    return parsed


def get_relevant_links(url, llm):
    """
    Ask the model which links on ``url`` are worth including in a brochure.

    Args:
        url: Address of the page whose links are examined.
        llm: Chat model exposing ``invoke(messages)``; the reply's ``content``
            is parsed as JSON.

    Returns:
        A dict with a ``"links"`` list. When the reply cannot be parsed, the
        list is empty so the caller can still build a brochure from the
        landing page alone.
    """
    website = Website(url)
    messages = [
        ("system", get_links_system_prompt(website)),
        ("human", get_links_user_prompt(website)),
    ]
    result = llm.invoke(messages)
    return _parse_links_response(result.content)
def get_all_details(url, llm):
    """
    Collect the text of the landing page and of every relevant linked page.

    Args:
        url: Address of the company's landing page.
        llm: Chat model passed through to ``get_relevant_links``.

    Returns:
        One string containing the landing page contents followed by a titled
        section for each linked page that could be fetched.

    Raises:
        requests.RequestException: If the landing page itself cannot be
            fetched. Failures on individual linked pages are skipped.
    """
    sections = ["Landing page:\n", Website(url).get_contents()]
    links = get_relevant_links(url, llm)
    print("Found links:", links)

    entries = links.get("links") if isinstance(links, dict) else None
    for link in entries or []:
        target = link.get("url") if isinstance(link, dict) else None
        if not target:
            continue
        # The model is asked for full URLs but may echo a relative href;
        # resolve it against the landing page so it can be requested.
        page_url = urljoin(url, target)
        try:
            page = Website(page_url)
        except requests.RequestException as exc:
            # One unreachable page should not discard everything else.
            print(f"Skipping {page_url}: {exc}")
            continue
        sections.append(f"\n\n{link.get('type', 'page')}\n")
        sections.append(page.get_contents())
    return "".join(sections)
def get_brochure_system_prompt():
    """Return the fixed system prompt that tells the model to write a markdown brochure."""
    # NOTE: there is no space between "markdown." and "Include"; the text is
    # reproduced exactly as the prompt has always been sent.
    return (
        "You are an assistant that analyzes the contents of several relevant pages from a company website "
        "and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown."
        "Include details of company culture, customers and careers/jobs if you have the information."
    )
def get_brochure_user_prompt(company_name, url, llm):
    """
    Build the user prompt containing the scraped site contents.

    Args:
        company_name: Name of the company the brochure is about.
        url: Address of the company's landing page.
        llm: Chat model passed through to ``get_all_details``.

    Returns:
        The prompt text, cut to at most 20,000 characters.
    """
    max_chars = 20_000  # Truncate if more than 20,000 characters
    parts = (
        f"You are looking at a company called: {company_name}\n",
        "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n",
        get_all_details(url, llm),
    )
    return "".join(parts)[:max_chars]
def create_brochure(company_name, url, llm_choice, groq_api_key):
    """
    Generate a markdown brochure for a company from its website.

    Args:
        company_name: Name of the company, used in the prompt.
        url: Address of the company's landing page. ``https://`` is assumed
            when no scheme is given.
        llm_choice: A key of ``llm_options`` selecting the Groq model.
        groq_api_key: Groq API key used for this request only.

    Returns:
        The brochure as a markdown string.

    Raises:
        gr.Error: If the model choice, API key or URL is missing, so the
            message is shown in the Gradio UI.
    """
    if llm_choice not in llm_options:
        raise gr.Error("Please select an LLM.")
    if not groq_api_key or not groq_api_key.strip():
        raise gr.Error("Please enter your Groq API key.")
    url = (url or "").strip()
    if not url:
        raise gr.Error("Please enter the company website URL.")
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    # Hand the key to the client directly. Writing it to os.environ would
    # change process-wide state, so one user's key would persist and be
    # visible to every other request served by this process.
    llm = ChatGroq(model=llm_options[llm_choice], api_key=groq_api_key.strip())
    messages = [
        ("system", get_brochure_system_prompt()),
        ("human", get_brochure_user_prompt(company_name, url, llm)),
    ]
    return llm.invoke(messages).content
# Display name shown in the dropdown -> Groq model identifier.
llm_options = dict(
    Gemma2="gemma2-9b-it",
    LLama="llama-3.2-3b-preview",
    Mixtral="mixtral-8x7b-32768",
)
# Gradio interface
# Input components, in the same order as create_brochure's parameters.
_brochure_inputs = [
    "text",  # Company name input
    "text",  # URL input
    gr.Dropdown(choices=list(llm_options.keys()), label="Select LLM"),  # LLM selection
    gr.Textbox(type="password", label="Enter Groq API Key"),  # API Key input
]

demo = gr.Interface(
    fn=create_brochure,
    inputs=_brochure_inputs,
    outputs="markdown",  # Output format
    title="Brochure Generator",
    description="Generate brochures by selecting a company name, URL, and LLM. Provide your Groq API Key.",
)

# Start the web UI when the script runs.
demo.launch()
|