EddyGiusepe committed
Commit • 6ac8795
Parent(s): 28c581e

Using LangChain for RAG with Websites
Files changed:
- .gitignore +3 -0
- poetry.lock +0 -0
- pyproject.toml +26 -0
- src/modules/scraper.py +66 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
+# EddyGiusepe
+venv_chatbotRAG/
+.env
poetry.lock
ADDED
The diff for this file is too large to render.
pyproject.toml
ADDED
@@ -0,0 +1,26 @@
+[tool.poetry]
+name = "ddua-embeddings"
+version = "0.1.0"
+description = "This project is a chatbot application that uses LangChain, OpenAI and FAISS to talk to the blog Diario Di Un Analista.it"
+authors = ["Andrea D'Agostino <andrea@diariodiunanalista.it>"]
+readme = "README.md"
+packages = [ { include = "src", from = "." } ]
+
+[tool.poetry.dependencies]
+python = ">=3.10,<4.0"
+streamlit = "^1.28.2"
+langchain = "^0.0.339"
+openai = "^1.3.5"
+trafilatura = "^1.6.2"
+python-dotenv = "^1.0.0"
+pandas = "^2.1.3"
+ruff = "^0.1.6"
+tiktoken = "^0.5.1"
+instructorembedding = "^1.0.1"
+faiss-cpu = "^1.7.4"
+watchdog = "^3.0.0"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
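Taken together, these pins describe the RAG stack the commit message announces: trafilatura scrapes the blog, tiktoken counts tokens, FAISS stores embeddings, and LangChain wires them to OpenAI. As a minimal sketch of that wiring under the langchain 0.0.x import paths pinned above (the sample texts, model name, and query are illustrative placeholders, not code from this repo):

from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

load_dotenv()  # Reads OPENAI_API_KEY from the .env file ignored above.

# Placeholder corpus standing in for the scraped articles.
texts = ["First article text ...", "Second article text ..."]
db = FAISS.from_texts(texts, OpenAIEmbeddings())

# Retrieval-augmented QA: embed the query, fetch similar chunks, answer.
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    retriever=db.as_retriever(),
)
print(qa.run("What does the blog say about embeddings?"))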
src/modules/scraper.py
ADDED
@@ -0,0 +1,66 @@
+"""
+Install the packages like this:
+
+$ poetry install
+"""
+import time
+import pandas as pd
+from tqdm import tqdm
+from trafilatura.sitemaps import sitemap_search
+from trafilatura import fetch_url, extract, extract_metadata
+
+
+def get_urls_from_sitemap(resource_url: str) -> list:
+    """
+    Collects the article URLs listed in a website's sitemap.
+    """
+    urls = sitemap_search(resource_url)
+    return urls
+
+
+def extract_article(url: str) -> tuple:
+    """
+    Extracts an article and its metadata from a URL with Trafilatura.
+    """
+    downloaded = fetch_url(url)
+    article = extract(downloaded, favor_precision=True, only_with_metadata=True)
+    metadata = extract_metadata(downloaded)
+    return article, metadata
+
+
+def create_dataset(list_of_websites: list) -> pd.DataFrame:
+    """
+    Creates a Pandas DataFrame of URLs and articles.
+    """
+    data = []
+    for website in tqdm(list_of_websites, desc="Websites"):
+        urls = get_urls_from_sitemap(website)
+        for url in tqdm(urls, desc="URLs"):
+            article, metadata = extract_article(url)
+            d = {
+                "url": url,
+                "article": article,
+                "title": metadata.title,
+                "description": metadata.description,
+                "author": metadata.author,
+                "date": metadata.date,
+            }
+            data.append(d)
+            time.sleep(0.5)  # Be polite to the server between requests.
+
+    # Drop exact duplicates and rows where extraction returned nothing.
+    df = pd.DataFrame(data)
+    df = df.drop_duplicates()
+    df = df.dropna()
+
+    return df
+
+
+if __name__ == "__main__":
+    list_of_websites = [
+        "https://www.diariodiunanalista.it/",
+    ]
+
+    df = create_dataset(list_of_websites)
+
+    df.to_csv("./data/articles.csv", index=False)
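A plausible next step, not included in this commit, is to chunk the scraped articles and persist a FAISS index for the chatbot to query. A hedged sketch using the langchain 0.0.x APIs pinned in pyproject.toml; the chunk sizes, metadata columns, and index path are illustrative assumptions:

from dotenv import load_dotenv
import pandas as pd
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

load_dotenv()  # OPENAI_API_KEY for the embedding calls.

df = pd.read_csv("./data/articles.csv")

# Token-based splitting via tiktoken keeps chunks within embedding limits.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=512,  # Illustrative size.
    chunk_overlap=64,
)
docs = splitter.create_documents(
    df["article"].tolist(),
    metadatas=df[["url", "title"]].to_dict("records"),
)

db = FAISS.from_documents(docs, OpenAIEmbeddings())
db.save_local("faiss_index")  # Reload later with FAISS.load_local("faiss_index", ...).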