File size: 2,331 Bytes
8fcd0f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_openai import OpenAIEmbeddings
from sklearn.cluster import KMeans
import numpy as np
from sklearn.decomposition import PCA
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
import os
from dotenv import load_dotenv
# Load a local .env file (if present) into the process environment before the
# key lookup below runs.
load_dotenv() # This loads the variables from .env
# Read once at import time under the lowercase name 'openai_api_key';
# os.getenv returns None when the variable is not set.
openai_api_key = os.getenv('openai_api_key')
def pdf_summary(ocr_results_folder, num_clusters=20):
    """Summarize the OCR text files under a folder with an OpenAI chat model.

    Every ``*.txt`` file below ``ocr_results_folder`` is loaded and embedded.
    The embeddings are grouped with k-means, and the document nearest to each
    cluster centre is kept as that cluster's representative. The
    representatives are joined with single spaces and sent to the chat model
    in one summarization prompt.

    Args:
        ocr_results_folder: Directory searched recursively for ``*.txt`` files.
        num_clusters: Upper bound on the number of clusters, and therefore on
            the number of representative documents. It is lowered to the
            number of loaded documents when fewer are available.

    Returns:
        The chat model's reply, parsed to a plain string.

    Raises:
        ValueError: If ``num_clusters`` is less than 1, or if no ``*.txt``
            files are found under ``ocr_results_folder``.
    """
    if num_clusters < 1:
        raise ValueError("num_clusters must be at least 1")

    loader = DirectoryLoader(ocr_results_folder, glob="**/*.txt", loader_cls=TextLoader)
    page_contents = [doc.page_content for doc in loader.load()]
    if not page_contents:
        # Fail early with a clear message instead of an obscure error from
        # the embedding or clustering step.
        raise ValueError(f"No .txt files found under {ocr_results_folder!r}")

    embeddings_model = OpenAIEmbeddings(
        model="text-embedding-3-small",
        openai_api_key=openai_api_key,
    )
    X = np.array(embeddings_model.embed_documents(page_contents))

    # KMeans refuses to fit more clusters than there are samples, so cap the
    # requested count at the number of documents actually loaded.
    n_clusters = min(num_clusters, len(page_contents))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)

    closest_point_indices = find_closest_point_indices(X, kmeans.cluster_centers_, 1)
    # Two centroids can share the same nearest document (e.g. duplicate
    # inputs); keep each document once, preserving cluster order.
    representative_ids = list(dict.fromkeys(int(index[0]) for index in closest_point_indices))
    extracted_contents = [page_contents[i] for i in representative_ids]

    prompt = ChatPromptTemplate.from_template("Summarize the article based on the texts provided from four aspects: Goal, Method, Results, and Conclusion: {topic}")
    model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=openai_api_key)
    chain = prompt | model | StrOutputParser()
    return chain.invoke({"topic": ' '.join(extracted_contents)})
def find_closest_point_indices(X, centroids, num_points=1):
    """Return, for each centroid, the indices of the rows of ``X`` nearest to it.

    Distances are Euclidean, computed row-wise over ``X``.

    Args:
        X: 2-D array-like of points, one point per row.
        centroids: Iterable of points with the same dimensionality as the
            rows of ``X``.
        num_points: How many nearest rows to report per centroid. If it
            exceeds the number of rows, all rows are reported.

    Returns:
        A list with one integer index array per centroid, in centroid order.
        Each array is sorted nearest first, with ties broken by the lower row
        index, and holds at most ``num_points`` entries.

    Raises:
        ValueError: If ``num_points`` is negative.
    """
    if num_points < 0:
        # A negative count would silently slice "all but the last k" rows.
        raise ValueError("num_points must be non-negative")

    points = np.asarray(X)
    # A stable sort keeps equidistant rows in index order, so the choice among
    # ties (e.g. duplicate points) is deterministic.
    return [
        np.argsort(np.linalg.norm(points - center, axis=1), kind="stable")[:num_points]
        for center in centroids
    ]