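"""Utility helpers for the Stock Sentiment Analysis pipeline: persisting scraped
social media data to disk and loading it back as langchain Documents."""
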
import json
import os
import pickle
from datetime import datetime
from typing import List

import pandas as pd
from langchain.schema import Document

def create_files(social_media_data, hugg=False):
    """Persist the scraped social media data as JSON, CSV, and pickle files."""
    folder_path = 'files' if hugg else 'Stock Sentiment Analysis/files'

    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    # Save the raw records to a JSON file
    with open(os.path.join(folder_path, 'social_media_data.json'), 'w') as f:
        json.dump(social_media_data, f)

    # Convert the data to a pandas DataFrame and export it as CSV and pickle
    df = pd.DataFrame(social_media_data)
    df.to_csv(os.path.join(folder_path, 'social_media_data.csv'), index=False)
    df.to_pickle(os.path.join(folder_path, 'social_media_data.pkl'))

def fetch_social_media_data(hugg=False):
    """Load the saved JSON data and rehydrate it as a list of langchain Documents."""
    file_path = 'files/social_media_data.json' if hugg else 'Stock Sentiment Analysis/files/social_media_data.json'
    with open(file_path, 'r') as file:
        data = json.load(file)
    # Reuse to_documents (defined below) instead of duplicating the conversion loop
    return to_documents(data)
        
def save_ingested_data(ingested_data):
    # Save the list to a file
    with open('Stock Sentiment Analysis/files/ingested_data.pkl', 'wb') as file:
        pickle.dump(ingested_data, file)

def save_analysed_data(analysed_data):
    # Save the list to a file
    with open('Stock Sentiment Analysis/files/analysed_data.pkl', 'wb') as file:
        pickle.dump(analysed_data, file)

def get_ingested_data():
    # Load the list from the file
    with open('Stock Sentiment Analysis/files/ingested_data.pkl', 'rb') as file:
        loaded_documents = pickle.load(file)
    return loaded_documents

def get_analysed_data():
    # Load the list from the file
    with open('Stock Sentiment Analysis/files/analysed_data.pkl', 'rb') as file:
        loaded_documents = pickle.load(file)
    return loaded_documents

def sample_documents(documents: List[Document], n: int) -> List[Document]:
    """
    Keeps up to `n` documents for each unique ("platform", "company") metadata
    combination. Documents are taken in encounter order, not sampled at random.

    Args:
        documents (List[Document]): The input list of `Document` objects.
        n (int): The maximum number of entries to keep per unique metadata combination.

    Returns:
        List[Document]: A new list of `Document` objects, with at most `n` entries
        per unique metadata combination.
    """
    # Create a dictionary to store the sampled documents per metadata combination
    sampled_docs = {}

    for doc in documents:
        combo = (doc.metadata["platform"], doc.metadata["company"])
        if combo not in sampled_docs:
            sampled_docs[combo] = []
        
        # Add the document to the list for its metadata combination, up to n entries
        if len(sampled_docs[combo]) < n:
            sampled_docs[combo].append(doc)

    # Flatten the dictionary into a single list
    return [doc for docs in sampled_docs.values() for doc in docs]
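
# Illustrative sketch of sample_documents behavior (hypothetical data, not part
# of the pipeline): given five reddit/AAPL documents and n=2, only the first
# two encountered are kept.
#
#   docs = [Document(page_content=f"post {i}",
#                    metadata={"platform": "reddit", "company": "AAPL"})
#           for i in range(5)]
#   sample_documents(docs, n=2)  # -> the first 2 documents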

def to_documents(data) -> List[Document]:
    """Convert raw records into langchain Documents with per-post metadata."""
    social_media_document = []
    for item in data:
        social_media_document.append(Document(
            page_content=str(item["page_content"]),
            metadata={
                "platform": item["platform"],
                "company": item["company"],
                "ingestion_timestamp": datetime.now().isoformat(),
                # Count words rather than characters in the post body
                "word_count": len(str(item["page_content"]["content"]).split()),
                "link": item.get("link", ""),
            }))
    return social_media_document
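
if __name__ == "__main__":
    # Minimal smoke test under assumed inputs: each record carries a
    # "page_content" dict with a "content" string, plus "platform", "company",
    # and an optional "link". The sample values below are illustrative only.
    demo_records = [{
        "page_content": {"content": "AAPL beat earnings expectations today."},
        "platform": "reddit",
        "company": "AAPL",
        "link": "https://example.com/post/1",
    }]
    create_files(demo_records, hugg=True)      # writes files/social_media_data.{json,csv,pkl}
    docs = fetch_social_media_data(hugg=True)  # loads the JSON back as Documents
    print(sample_documents(docs, n=1))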