# src/data_processing.py

import os
import json

import pandas as pd
from pdfminer.high_level import extract_text

from utils import tokenize, build_vocab, save_vocab


def read_txt(file_path):
    """Read a plain-text file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def read_pdf(file_path):
    """Extract all text from a PDF file."""
    return extract_text(file_path)


def read_json(file_path):
    """Load a JSON file; may return a dict, list, or string."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


def read_csv(file_path):
    """Read a CSV file and flatten every cell into a single text string."""
    df = pd.read_csv(file_path)
    # Concatenate all text columns into a single string
    text = ' '.join(df.astype(str).values.flatten())
    return text


def process_file(file_path):
    """Dispatch to the appropriate reader based on the file extension."""
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()
    if ext == '.txt':
        return read_txt(file_path)
    elif ext == '.pdf':
        return read_pdf(file_path)
    elif ext == '.json':
        return read_json(file_path)
    elif ext == '.csv':
        return read_csv(file_path)
    else:
        print(f"Unsupported file format: {ext}")
        return None


def load_data(raw_data_dir='data/raw'):
    """Walk the raw data directory and collect the contents of every supported file."""
    all_data = []
    for root, dirs, files in os.walk(raw_data_dir):
        for file in files:
            file_path = os.path.join(root, file)
            data = process_file(file_path)
            if data:
                all_data.append(data)
    return all_data


def prepare_training_data(processed_data, vocab_path='vocab.json'):
    """Tokenize each text entry, build the vocabulary, and save it to disk."""
    tokenized_texts = []
    for entry in processed_data:
        if isinstance(entry, str):
            tokens = tokenize(entry)
            tokenized_texts.append(tokens)
        elif isinstance(entry, list):
            # JSON files may load as a list of strings
            for item in entry:
                if isinstance(item, str):
                    tokens = tokenize(item)
                    tokenized_texts.append(tokens)
    vocab = build_vocab(tokenized_texts)
    save_vocab(vocab, vocab_path)
    return tokenized_texts, vocab


def save_tokenized_data(tokenized_texts, filepath='data/processed/tokenized_data.json'):
    """Write the tokenized texts to disk as JSON."""
    # Ensure the output directory exists before writing
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(tokenized_texts, f, ensure_ascii=False, indent=4)


def save_processed_data(processed_data, filepath='data/processed/processed_data.json'):
    """Write the raw extracted data to disk as JSON."""
    # Ensure the output directory exists before writing
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    print("Loading raw data...")
    data = load_data()
    print(f"Loaded {len(data)} data entries.")

    print("Preparing training data...")
    tokenized_texts, vocab = prepare_training_data(data)

    save_tokenized_data(tokenized_texts)
    save_processed_data(data)

    print("Data processing complete.")
    print(f"Vocabulary size: {len(vocab)}")
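

# utils.py — hypothetical sketch only. The real `utils` module imported above is not
# shown here; these minimal implementations of tokenize, build_vocab, and save_vocab
# are assumptions that illustrate the interface data_processing.py relies on, not the
# project's actual code.
import json
import re


def tokenize(text):
    # Naive lowercase word/punctuation tokenizer; a real project may use a trained tokenizer.
    return re.findall(r"\w+|[^\w\s]", text.lower())


def build_vocab(tokenized_texts, specials=("<pad>", "<unk>")):
    # Map each distinct token to an integer id, reserving the first ids for special tokens.
    vocab = {tok: idx for idx, tok in enumerate(specials)}
    for tokens in tokenized_texts:
        for token in tokens:
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab


def save_vocab(vocab, vocab_path):
    # Persist the token-to-id mapping as JSON.
    with open(vocab_path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)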