# src/data_processing.py
import os
import json
from pdfminer.high_level import extract_text
import pandas as pd
from utils import tokenize, build_vocab, save_vocab
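# NOTE: `utils` is a project-local module that is not shown here. Judging from
# how its helpers are used below, their signatures are assumed to be roughly:
#   tokenize(text: str) -> list[str]            # split raw text into tokens
#   build_vocab(token_lists) -> dict[str, int]  # map each token to an integer id
#   save_vocab(vocab, path: str) -> None        # write the vocabulary to disk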

def read_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def read_pdf(file_path):
    return extract_text(file_path)

def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def read_csv(file_path):
    df = pd.read_csv(file_path)
    # Cast every cell (including numeric columns) to a string and join them
    # into one whitespace-separated block of text
    text = ' '.join(df.astype(str).values.flatten())
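    # For example, a CSV with header "text,score" and rows ("hello", 1) and
    # ("world", 2) yields "hello 1 world 2" (header names are not included).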
    return text

def process_file(file_path):
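    """Dispatch to the reader matching the file extension; return None for unsupported formats."""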
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()
    if ext == '.txt':
        return read_txt(file_path)
    elif ext == '.pdf':
        return read_pdf(file_path)
    elif ext == '.json':
        return read_json(file_path)
    elif ext == '.csv':
        return read_csv(file_path)
    else:
        print(f"Unsupported file format: {ext}")
        return None

def load_data(raw_data_dir='data/raw'):
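    """Walk `raw_data_dir` recursively and return the parsed contents of every supported file."""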
    all_data = []
    for root, dirs, files in os.walk(raw_data_dir):
        for file in files:
            file_path = os.path.join(root, file)
            data = process_file(file_path)
            # Keep everything that parsed successfully (process_file returns None otherwise)
            if data is not None:
                all_data.append(data)
    return all_data

def prepare_training_data(processed_data, vocab_path='vocab.json'):
    """Tokenize each text entry, build a vocabulary over all tokens, and save it to `vocab_path`."""
    tokenized_texts = []
    for entry in processed_data:
        if isinstance(entry, str):
            # Plain text from .txt, .pdf and .csv files
            tokenized_texts.append(tokenize(entry))
        elif isinstance(entry, list):
            # JSON arrays: tokenize each string element
            for item in entry:
                if isinstance(item, str):
                    tokenized_texts.append(tokenize(item))
        elif isinstance(entry, dict):
            # JSON objects: tokenize each top-level string value
            for value in entry.values():
                if isinstance(value, str):
                    tokenized_texts.append(tokenize(value))
    vocab = build_vocab(tokenized_texts)
    save_vocab(vocab, vocab_path)
    return tokenized_texts, vocab

def save_tokenized_data(tokenized_texts, filepath='data/processed/tokenized_data.json'):
    # Make sure the output directory exists before writing
    os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(tokenized_texts, f, ensure_ascii=False, indent=4)

def save_processed_data(processed_data, filepath='data/processed/processed_data.json'):
    os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    print("Loading raw data...")
    data = load_data()
    print(f"Loaded {len(data)} data entries.")

    print("Preparing training data...")
    tokenized_texts, vocab = prepare_training_data(data)
    save_tokenized_data(tokenized_texts)
    save_processed_data(data)
    print("Data processing complete.")
    print(f"Vocabulary size: {len(vocab)}")