import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import nlp
import random
dataset = nlp.load_dataset('emotion')
Using custom data configuration default
#Get specific data from dataset
train = dataset['train']
val = dataset['validation']
def get_tweet(data):
    # Get the raw text and emotion label for each example in the dataset
    tweets = [x['text'] for x in data]
    labels = [x['label'] for x in data]
    return tweets, labels
tweets, emLabels = get_tweet(train)
#Define a random global index reused by the quick test prints below
randomIndex = random.randint(0, len(tweets) - 1)
def print_tweet(t, l):
    # Given a list of tweets and a list of labels,
    # print a random tweet and its corresponding label
    print("tweet:\n", t[randomIndex])
    print("label:", l[randomIndex])
    print()
def print_tokenized(tokenizer, tweets):
    # Print a random tweet and its corresponding tokenized version
    print("tweet:\n", tweets[randomIndex])
    print("tokenized:", tokenizer.texts_to_sequences(tweets)[randomIndex])
    print()
print_tweet(tweets, emLabels)
tweet:
 i feel too overwhelmed to clean anything so i just let it all pile up until it makes my whole life feel like it is going to come crashing down around me and i am helpless to stop it
label: surprise
#Tokenize the tweets
from tensorflow.keras.preprocessing.text import Tokenizer
#Initialize the tokenizer to consider only the top 10,000 most frequently used words.
#Anything not in the top 10,000 is treated as out-of-vocabulary (OOV) and tokenized as <UNK>.
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')
#Tokenize the tweets dataset that was previously imported
tokenizer.fit_on_texts(tweets)
#Test the tokenizer
print("======Tokenizer Test======")
print_tokenized(tokenizer, tweets)
======Tokenizer Test======
tweet:
 i feel too overwhelmed to clean anything so i just let it all pile up until it makes my whole life feel like it is going to come crashing down around me and i am helpless to stop it
tokenized: [2, 3, 94, 238, 5, 1308, 166, 15, 2, 32, 207, 13, 36, 3939, 42, 332, 13, 152, 11, 258, 78, 3, 14, 13, 22, 87, 5, 182, 3234, 142, 128, 18, 4, 2, 24, 293, 5, 254, 13]
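#Optional sanity check (not part of the original run): any word outside the fitted
#vocabulary should map to the <UNK> index (index 1 by Keras convention); the word
#"zxqvbn" below is made up and assumed to be out of vocabulary.
print(tokenizer.word_index['<UNK>'])
print(tokenizer.texts_to_sequences(['i feel zxqvbn today']))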
#Pad and truncate the sequences from the previous step so they all have the same length.
lengths = [len(tweet.split(' ')) for tweet in tweets]
#Analyze the histogram to choose a maximum sequence length for the tweets
plt.hist(lengths)
plt.show()
maxlength = 50
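#Optional check (not part of the original run): np.percentile gives a data-driven
#sense of how many tweets will be truncated; maxlength = 50 is assumed to cover
#the vast majority of lengths seen in the histogram.
print('95th percentile of tweet lengths:', np.percentile(lengths, 95))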
#Begin padding the sequences to the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences
def get_sequences(tokenizer, tweets):
    sequences = tokenizer.texts_to_sequences(tweets)
    padded = pad_sequences(sequences, truncating='post', padding='post', maxlen=maxlength)
    return padded
padded_train_seq = get_sequences(tokenizer, tweets)
print("======Padded Test======")
print(tweets[randomIndex])
print(padded_train_seq[randomIndex])
print()
======Padded Test======
i feel too overwhelmed to clean anything so i just let it all pile up until it makes my whole life feel like it is going to come crashing down around me and i am helpless to stop it
[   2    3   94  238    5 1308  166   15    2   32  207   13   36 3939
   42  332   13  152   11  258   78    3   14   13   22   87    5  182
 3234  142  128   18    4    2   24  293    5  254   13    0    0    0
    0    0    0    0    0    0    0    0]
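#Optional sanity check (not part of the original run): after padding, every
#sequence should have exactly maxlength entries.
print(padded_train_seq.shape)   # expected: (number of training tweets, 50)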
#Prepare the labels
#Note: set iteration order is arbitrary, so the exact label-to-id mapping can vary between runs
classes = set(emLabels)
#Helper dictionaries to convert between a label and its numerical representation
class_to_index = dict((c, i) for i, c in enumerate(classes))
index_to_class = dict((v, k) for k, v in class_to_index.items())
#Convert our dataset labels to a numerical representation
names_to_ids = lambda labels: np.array([class_to_index.get(label) for label in labels])
train_labels = names_to_ids(emLabels)
print(class_to_index)
{'joy': 0, 'love': 1, 'fear': 2, 'anger': 3, 'surprise': 4, 'sadness': 5}
print(index_to_class)
{0: 'joy', 1: 'love', 2: 'fear', 3: 'anger', 4: 'surprise', 5: 'sadness'}
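#Optional round-trip check (not part of the original run): converting a label to
#an id and back should return the original class name.
example_label = emLabels[randomIndex]
print(example_label, '->', class_to_index[example_label], '->', index_to_class[class_to_index[example_label]])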
#Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=16, input_length=maxlength),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)),
    tf.keras.layers.Dense(6, activation='softmax')
])
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)
model.summary()
Model: "sequential_2" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_2 (Embedding) (None, 50, 16) 160000 bidirectional_4 (Bidirectio (None, 50, 40) 5920 nal) bidirectional_5 (Bidirectio (None, 40) 9760 nal) dense_2 (Dense) (None, 6) 246 ================================================================= Total params: 175,926 Trainable params: 175,926 Non-trainable params: 0 _________________________________________________________________
#Prepare validation set
val_tweets, val_labels = get_tweet(val)
val_seq = get_sequences(tokenizer, val_tweets)
val_labels = names_to_ids(val_labels)
#Train the model
history = model.fit(
    padded_train_seq,
    train_labels,
    validation_data=(val_seq, val_labels),
    epochs=10,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)
    ]
)
Epoch 1/10
500/500 [==============================] - 23s 32ms/step - loss: 1.3104 - accuracy: 0.4797 - val_loss: 0.7866 - val_accuracy: 0.7160
Epoch 2/10
500/500 [==============================] - 21s 42ms/step - loss: 0.5673 - accuracy: 0.7934 - val_loss: 0.5548 - val_accuracy: 0.8070
Epoch 3/10
500/500 [==============================] - 23s 46ms/step - loss: 0.2760 - accuracy: 0.9104 - val_loss: 0.3871 - val_accuracy: 0.8750
Epoch 4/10
500/500 [==============================] - 26s 52ms/step - loss: 0.1677 - accuracy: 0.9467 - val_loss: 0.3903 - val_accuracy: 0.8750
Epoch 5/10
500/500 [==============================] - 34s 67ms/step - loss: 0.1224 - accuracy: 0.9623 - val_loss: 0.3785 - val_accuracy: 0.8900
Epoch 6/10
500/500 [==============================] - 30s 60ms/step - loss: 0.1014 - accuracy: 0.9686 - val_loss: 0.3640 - val_accuracy: 0.8855
Epoch 7/10
500/500 [==============================] - 31s 62ms/step - loss: 0.0910 - accuracy: 0.9715 - val_loss: 0.3724 - val_accuracy: 0.8920
Epoch 8/10
500/500 [==============================] - 29s 58ms/step - loss: 0.0865 - accuracy: 0.9712 - val_loss: 0.3726 - val_accuracy: 0.8945
Epoch 9/10
500/500 [==============================] - 34s 67ms/step - loss: 0.0739 - accuracy: 0.9762 - val_loss: 0.3533 - val_accuracy: 0.8980
Epoch 10/10
500/500 [==============================] - 38s 77ms/step - loss: 0.0576 - accuracy: 0.9814 - val_loss: 0.3788 - val_accuracy: 0.8830
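#Optional (not part of the original run): report the best validation accuracy
#observed during training, taken straight from the Keras History object.
best_epoch = int(np.argmax(history.history['val_accuracy']))
print('best val_accuracy: %.4f at epoch %d' % (history.history['val_accuracy'][best_epoch], best_epoch + 1))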
def show_history(h):
    # Plot training/validation accuracy and loss curves side by side
    epochs_trained = len(h.history['loss'])
    plt.figure(figsize=(16, 6))
    plt.subplot(1, 2, 1)
    plt.plot(range(0, epochs_trained), h.history.get('accuracy'), label='Training')
    plt.plot(range(0, epochs_trained), h.history.get('val_accuracy'), label='Validation')
    plt.ylim([0., 1.])
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(range(0, epochs_trained), h.history.get('loss'), label='Training')
    plt.plot(range(0, epochs_trained), h.history.get('val_loss'), label='Validation')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()
def show_confusion_matrix(y_true, y_pred, classes):
    # Plot a row-normalized confusion matrix with class names on both axes
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_true, y_pred, normalize='true')
    plt.figure(figsize=(8, 8))
    sp = plt.subplot(1, 1, 1)
    ctx = sp.matshow(cm)
    plt.xticks(list(range(0, 6)), labels=classes)
    plt.yticks(list(range(0, 6)), labels=classes)
    plt.colorbar(ctx)
    plt.show()
#Results
show_history(history)
#Test the model on the held-out test split of the dataset:
test_dataset = dataset['test']
test_tweets, test_labels = get_tweet(test_dataset)
#Tokenize and pad the test tweets, then evaluate the model on them
test_seq = get_sequences(tokenizer, test_tweets)
test_labels = names_to_ids(test_labels)
_ = model.evaluate(test_seq, test_labels)
63/63 [==============================] - 1s 19ms/step - loss: 0.3623 - accuracy: 0.8815
#Plot the confusion matrix to inspect per-class performance
predictions = model.predict(test_seq)
predictions = np.argmax(predictions, axis=-1)
show_confusion_matrix(test_labels, predictions, list(index_to_class.values()))
63/63 [==============================] - 3s 12ms/step
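#Minimal inference sketch (not part of the original run): classify a new, made-up
#sentence with the trained tokenizer and model; the sample text is an assumption.
sample = ['i am so excited to finally see you again']
sample_seq = get_sequences(tokenizer, sample)
sample_pred = int(np.argmax(model.predict(sample_seq), axis=-1)[0])
print('predicted emotion:', index_to_class[sample_pred])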