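"""Utility helpers for the Stock Sentiment Analysis project.

Persists scraped social-media records to JSON/CSV/pickle, reloads them as
langchain `Document` objects, and samples documents per (platform, company)
pair.
"""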
from datetime import datetime
import json
import os
import pickle
from typing import List
from langchain.schema import Document
import pandas as pd

def create_files(social_media_data, hugg=False):
    folder_path = 'Stock Sentiment Analysis/files'
    if hugg:
        folder_path = 'files'
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    # Save the raw records to a JSON file
    with open(os.path.join(folder_path, 'social_media_data.json'), 'w') as f:
        json.dump(social_media_data, f)
    # Convert the records to a pandas DataFrame and export as CSV and pickle
    df = pd.DataFrame(social_media_data)
    df.to_csv(os.path.join(folder_path, "social_media_data.csv"), index=False)
    df.to_pickle(os.path.join(folder_path, "social_media_data.pkl"))

def fetch_social_media_data(hugg=False):
    file_path = 'Stock Sentiment Analysis/files/social_media_data.json'
    if hugg:
        file_path = 'files/social_media_data.json'
    with open(file_path, 'r') as file:
        data = json.load(file)
    # Reuse the shared conversion helper instead of duplicating it here
    return to_documents(data)

def save_ingested_data(ingested_data):
    # Save the list to a file
    with open('Stock Sentiment Analysis/files/ingested_data.pkl', 'wb') as file:
        pickle.dump(ingested_data, file)


def save_analysed_data(analysed_data):
    # Save the list to a file
    with open('Stock Sentiment Analysis/files/analysed_data.pkl', 'wb') as file:
        pickle.dump(analysed_data, file)


def get_ingested_data():
    # Load the list from the file
    with open('Stock Sentiment Analysis/files/ingested_data.pkl', 'rb') as file:
        return pickle.load(file)


def get_analysed_data():
    # Load the list from the file
    with open('Stock Sentiment Analysis/files/analysed_data.pkl', 'rb') as file:
        return pickle.load(file)

def sample_documents(documents: List[Document], n: int) -> List[Document]:
    """
    Samples up to `n` entries for each unique ("platform", "company") metadata
    combination from the input documents.

    Args:
        documents (List[Document]): The input list of `Document` objects.
        n (int): The maximum number of entries to keep per unique metadata
            combination.

    Returns:
        List[Document]: A new list of `Document` objects, with at most `n`
        entries per unique metadata combination.
    """
    # Group documents by their (platform, company) metadata combination,
    # keeping only the first n seen for each combination
    sampled_docs = {}
    for doc in documents:
        combo = (doc.metadata["platform"], doc.metadata["company"])
        if len(sampled_docs.setdefault(combo, [])) < n:
            sampled_docs[combo].append(doc)
    # Flatten the dictionary into a single list
    return [doc for docs in sampled_docs.values() for doc in docs]

def to_documents(data) -> List[Document]:
    social_media_document = []
    for item in data:
        social_media_document.append(Document(
            page_content=str(item["page_content"]),
            metadata={
                "platform": item["platform"],
                "company": item["company"],
                "ingestion_timestamp": datetime.now().isoformat(),
                # Count words rather than characters in the post body
                "word_count": len(str(item["page_content"]["content"]).split()),
                "link": item.get("link", ""),
            },
        ))
    return social_media_document
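

# Minimal usage sketch. The record shape below is inferred from how this
# module reads page_content["content"], "platform", "company", and "link";
# the example values themselves are hypothetical.
if __name__ == "__main__":
    social_media_data = [
        {
            "platform": "reddit",
            "company": "ACME",
            "page_content": {"title": "Earnings beat",
                             "content": "ACME beat estimates this quarter."},
            "link": "https://example.com/post/1",
        },
    ]
    create_files(social_media_data, hugg=True)  # writes files/social_media_data.{json,csv,pkl}
    docs = fetch_social_media_data(hugg=True)   # reload the JSON as Document objects
    sampled = sample_documents(docs, n=1)       # keep at most 1 doc per (platform, company)
    print(len(docs), len(sampled))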