|
import csv |
|
import numpy as np |
|
import nltk |
|
|
|
# Global vocabulary: maps each token to a unique integer id.  Populated by
# get_word_embeddings() during training and consulted by get_sequence() when
# converting new text to ids.
words_to_nums = {}
|
|
|
def get_data_for_training(filename):
    """Read a comma-delimited CSV file and return its rows as a list of lists.

    Args:
        filename: path to the CSV file.

    Returns:
        list[list[str]]: every row of the file, header row included.
    """
    # The original opened the file and never closed it (handle leak);
    # "with" guarantees closure.  newline='' is the csv-module-recommended
    # mode for files passed to csv.reader.
    with open(filename, 'rt', newline='') as raw_data:
        return list(csv.reader(raw_data, delimiter=','))
|
|
|
|
|
def get_data_and_labels(raw_data):
    """Split raw CSV rows into tokenized review texts and binary labels.

    Expected row layout (after a header row): [id, sentiment, text, ...],
    where column 1 holds 'positive'/'negative' and column 2 holds the text.

    Args:
        raw_data: list of CSV rows (lists of strings), header included.
            Unlike the original implementation, the list is NOT mutated.

    Returns:
        tuple: (list of token lists, np.ndarray of int labels, where
        'positive' -> 1 and anything else -> 0).
    """
    # Skip the header row; work on a slice so the caller's list is untouched
    # (the original destructively deleted rows/columns from the argument).
    rows = raw_data[1:]
    # Column 1 is the sentiment string.  Build the int array directly instead
    # of writing ints into a numpy string array and re-casting afterwards.
    labels = np.array([1 if row[1] == 'positive' else 0 for row in rows],
                      dtype='int')
    # Column 2 is the review text; tokenize it into a list of words.
    texts = [nltk.word_tokenize(row[2]) for row in rows]
    return texts, labels
|
|
|
|
|
def get_word_embeddings(sentences):
    """Map tokenized sentences to lists of integer word ids.

    Unknown words are appended to the module-level ``words_to_nums``
    vocabulary; previously seen words reuse their existing id.

    Args:
        sentences: iterable of token lists (e.g. nltk.word_tokenize output).

    Returns:
        np.ndarray (dtype=object): one list of int ids per sentence.
    """
    # Continue numbering after any ids already present.  The original reset
    # the counter to 0 on every call, so a second call would hand out ids
    # that collide with entries assigned by the first call.
    counter = len(words_to_nums)
    data = []
    for words in sentences:
        num = []
        for word in words:
            if word not in words_to_nums:
                words_to_nums[word] = counter
                num.append(counter)
                counter = counter + 1
            else:
                num.append(words_to_nums[word])
        data.append(num)

    # dtype=object keeps ragged (variable-length) sentence rows intact.
    data = np.array(data, dtype=object)
    return data
|
|
|
|
|
def vectorize_sequence(sequences, dimensions):
    """Multi-hot encode integer index sequences.

    Args:
        sequences: sized iterable of index lists; each index < dimensions.
        dimensions: width of each output vector (vocabulary size).

    Returns:
        np.ndarray of shape (len(sequences), dimensions) holding 1.0 at
        every index present in the corresponding sequence, 0.0 elsewhere.
    """
    encoded = np.zeros((len(sequences), dimensions))
    # Fancy indexing flips every listed position of a row in one shot.
    for row, indices in zip(encoded, sequences):
        row[indices] = 1.0
    return encoded
|
|
|
def get_sequence(text):
    """Tokenize *text* and convert it to known vocabulary ids.

    Words absent from the module-level ``words_to_nums`` vocabulary are
    silently skipped.

    Args:
        text: raw input string.

    Returns:
        np.ndarray of shape (1, n_known_words): the id sequence wrapped as
        a batch of one.
    """
    tokens = nltk.word_tokenize(text)
    ids = [words_to_nums[tok] for tok in tokens if tok in words_to_nums]
    # Outer list makes the result a 2-D, batch-of-one array.
    return np.array([ids])