EddyGiusepe committed
Commit • 6ac8795
Parent(s): 28c581e

Using LangChain for RAG with Websites
Files changed:
- .gitignore +3 -0
- poetry.lock +0 -0
- pyproject.toml +26 -0
- src/modules/scraper.py +66 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
+# EddyGiusepe
+venv_chatbotRAG/
+.env
poetry.lock
ADDED
The diff for this file is too large to render.
pyproject.toml
ADDED
@@ -0,0 +1,26 @@
+[tool.poetry]
+name = "ddua-embeddings"
+version = "0.1.0"
+description = "This project is a chatbot application that uses LangChain, OpenAI and FAISS to talk to the blog Diario Di Un Analista.it"
+authors = ["Andrea D'Agostino <andrea@diariodiunanalista.it>"]
+readme = "README.md"
+packages = [ { include = "src", from = "." } ]
+
+[tool.poetry.dependencies]
+python = ">=3.10,<4.0"
+streamlit = "^1.28.2"
+langchain = "^0.0.339"
+openai = "^1.3.5"
+trafilatura = "^1.6.2"
+python-dotenv = "^1.0.0"
+pandas = "^2.1.3"
+ruff = "^0.1.6"
+tiktoken = "^0.5.1"
+instructorembedding = "^1.0.1"
+faiss-cpu = "^1.7.4"
+watchdog = "^3.0.0"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
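Taken together, these pins describe the RAG stack the commit message announces: trafilatura scrapes the blog, tiktoken counts tokens, FAISS stores embeddings, and LangChain wires them to OpenAI. As a minimal sketch of that wiring under the langchain 0.0.x import paths pinned above (the sample texts, model name, and query are illustrative placeholders, not code from this repo):

from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

load_dotenv()  # Reads OPENAI_API_KEY from the .env file ignored above.

# Placeholder corpus standing in for the scraped articles.
texts = ["First article text ...", "Second article text ..."]
db = FAISS.from_texts(texts, OpenAIEmbeddings())

# Retrieval-augmented QA: embed the query, fetch similar chunks, answer.
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    retriever=db.as_retriever(),
)
print(qa.run("What does the blog say about embeddings?"))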
src/modules/scraper.py
ADDED
@@ -0,0 +1,66 @@
+"""
+Install the packages like this:
+
+$ poetry install
+"""
+import time
+import pandas as pd
+from tqdm import tqdm
+from trafilatura.sitemaps import sitemap_search
+from trafilatura import fetch_url, extract, extract_metadata
+
+
+def get_urls_from_sitemap(resource_url: str) -> list:
+    """
+    Collects the article URLs listed in a website's sitemap.
+    """
+    urls = sitemap_search(resource_url)
+    return urls
+
+
+def extract_article(url: str) -> tuple:
+    """
+    Extracts an article and its metadata from a URL with Trafilatura.
+    """
+    downloaded = fetch_url(url)
+    article = extract(downloaded, favor_precision=True, only_with_metadata=True)
+    metadata = extract_metadata(downloaded)
+    return article, metadata
+
+
+def create_dataset(list_of_websites: list) -> pd.DataFrame:
+    """
+    Creates a Pandas DataFrame of URLs and articles.
+    """
+    data = []
+    for website in tqdm(list_of_websites, desc="Websites"):
+        urls = get_urls_from_sitemap(website)
+        for url in tqdm(urls, desc="URLs"):
+            article, metadata = extract_article(url)
+            d = {
+                "url": url,
+                "article": article,
+                "title": metadata.title,
+                "description": metadata.description,
+                "author": metadata.author,
+                "date": metadata.date,
+            }
+            data.append(d)
+            time.sleep(0.5)  # Be polite to the server between requests.
+
+    # Drop exact duplicates and rows where extraction returned nothing.
+    df = pd.DataFrame(data)
+    df = df.drop_duplicates()
+    df = df.dropna()
+
+    return df
+
+
+if __name__ == "__main__":
+    list_of_websites = [
+        "https://www.diariodiunanalista.it/",
+    ]
+
+    df = create_dataset(list_of_websites)
+
+    df.to_csv("./data/articles.csv", index=False)
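A plausible next step, not included in this commit, is to chunk the scraped articles and persist a FAISS index for the chatbot to query. A hedged sketch using the langchain 0.0.x APIs pinned in pyproject.toml; the chunk sizes, metadata columns, and index path are illustrative assumptions:

from dotenv import load_dotenv
import pandas as pd
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

load_dotenv()  # OPENAI_API_KEY for the embedding calls.

df = pd.read_csv("./data/articles.csv")

# Token-based splitting via tiktoken keeps chunks within embedding limits.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=512,  # Illustrative size.
    chunk_overlap=64,
)
docs = splitter.create_documents(
    df["article"].tolist(),
    metadatas=df[["url", "title"]].to_dict("records"),
)

db = FAISS.from_documents(docs, OpenAIEmbeddings())
db.save_local("faiss_index")  # Reload later with FAISS.load_local("faiss_index", ...).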