
Sunday, August 25, 2024

Using LSTM For IMDB Prediction


 


In this post we present an example of analyzing IMDB movie reviews to predict whether a review is positive or negative.


This post is based on the following:

  1. RNN Classifier with LSTM Trained on Own Dataset (IMDB)
  2. Sentiment analysis using LSTM - PyTorch


Unlike previous PyTorch applications, I had a really bad time until I got this one working. This is because torchtext development has stopped, and because the default LSTM layout expects input whose first dimension is the maximum sentence length, instead of the usual convention where the first dimension is the number of samples (the batch size); a short sketch of the two layouts follows.
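
To illustrate the shape convention, here is a minimal sketch (not part of the post's code) comparing PyTorch's default sequence-first LSTM input with the batch_first=True alternative; the tensor sizes are arbitrary example values:

import torch

batch_size, sentence_length, embedding_dim, hidden_dim = 4, 10, 32, 32

lstm_seq_first = torch.nn.LSTM(embedding_dim, hidden_dim)                      # default: [sentence length, batch, features]
lstm_batch_first = torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True)  # alternative: [batch, sentence length, features]

x = torch.randn(batch_size, sentence_length, embedding_dim)  # batch-first sample input
output_seq_first, _ = lstm_seq_first(x.transpose(0, 1))      # -> [sentence length, batch, hidden]
output_batch_first, _ = lstm_batch_first(x)                  # -> [batch, sentence length, hidden]
print(output_seq_first.shape, output_batch_first.shape)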

This runs for an enormous amount of time, so don't expect it to ever complete on your laptop unless you change the size limit (the limit_size variable in the code below).


Notice that the embedding vectors in this case are part of the learned network; this is NOT a pre-trained model, so the network learns how to embed words into vectors. This is rarely done these days; to reduce cost, other techniques such as pre-trained Word2Vec embeddings are used instead (a sketch of that alternative follows below).
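
For comparison, a small sketch (not from the original code) of how a pre-trained embedding matrix, for example one exported from Word2Vec, could be plugged in instead of training the embedding layer from scratch; the random tensor below is just a stand-in for real pre-trained vectors:

import torch

vocabulary_size, embedding_dimension = 1002, 32
pretrained_vectors = torch.randn(vocabulary_size, embedding_dimension)  # stand-in for real Word2Vec vectors

# freeze=True keeps the vectors fixed, so the network does not spend time learning them
embedding = torch.nn.Embedding.from_pretrained(pretrained_vectors, freeze=True)
print(embedding(torch.tensor([0, 1, 2])).shape)  # torch.Size([3, 32])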


Before running this, make sure to install the requirements, especially spacy (the tokenizer):

pip3 install torch
pip3 install spacy
pip3 install datasets
python -m spacy download en_core_web_sm
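
After installing, a quick sanity check (my own addition, not part of the original post) confirms that the tokenizer loads and splits text into tokens:

import spacy

nlp = spacy.load("en_core_web_sm")
print([token.text for token in nlp("This movie was surprisingly good!")])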



import time

import numpy as np
import spacy
import torch
from datasets import load_dataset
from torch.utils.data import TensorDataset, DataLoader

print("loading nlp")
nlp = spacy.load("en_core_web_sm")


class LstmNetwork(torch.nn.Module):

    def __init__(self, vocabulary_size, embedding_dimension, hidden_dimension, output_dimension):
        super().__init__()

        self.embedding = torch.nn.Embedding(vocabulary_size, embedding_dimension)
        self.lstm = torch.nn.LSTM(embedding_dimension, hidden_dimension)
        self.fully_connected = torch.nn.Linear(hidden_dimension, output_dimension)

    def forward(self, text):
        # embedding expects first dimension as sentence length
        text = torch.transpose(text, 0, 1)

        # text dim: [sentence length, batch size]

        embedded = self.embedding(text)
        # embedded dim: [sentence length, batch size, embedding dim]

        output, (hidden, cell) = self.lstm(embedded)
        # output dim: [sentence length, batch size, hidden dim]
        # hidden dim: [1, batch size, hidden dim]

        hidden.squeeze_(0)
        # hidden dim: [batch size, hidden dim]

        output = self.fully_connected(hidden)
        return output


class Trainer:
    def __init__(self):
        # hyper parameters
        self.vocabulary_max_size = 1000

        self.learning_rate = 0.005
        self.max_words_per_sample = 500

        self.batch_size = 128
        self.number_of_epochs = 10

        self.embedding_dimension = 32
        self.hidden_dimension = 32
        self.number_of_classes = 2

        limit_size = None
        # limit_size = 1000

        self.device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
        print('loading dataset')
        imdb = load_dataset("imdb")
        test_data = imdb["test"]
        train_data = imdb["train"]

        print('train data size', len(train_data))
        print('test data size', len(test_data))

        if limit_size:
            train_data = train_data.select(range(limit_size))
            test_data = test_data.select(range(limit_size))

        self.all_labels = {}
        self.all_vocabulary = {}

        self.process_dataset(train_data, self.pre_process)

        print('vocabulary size', len(self.all_vocabulary))
        print('labels', self.all_labels)

        # predefined word indexes
        # 0 - for padding
        # 1 - for unknown
        self.vocabulary_indices = self.create_vocabulary_indices()

        print('used vocabulary', self.vocabulary_indices)

        self.temp_onehot_x = []
        self.temp_onehot_y = []
        self.process_dataset(train_data, self.populate_onehot)
        train_onehot_x = np.array(self.temp_onehot_x)
        train_onehot_y = np.array(self.temp_onehot_y)
        print('example of onehot x', train_onehot_x[0])
        print('example of onehot y', train_onehot_y[0])

        self.temp_onehot_x = []
        self.temp_onehot_y = []
        self.process_dataset(test_data, self.populate_onehot)
        test_onehot_x = np.array(self.temp_onehot_x)
        test_onehot_y = np.array(self.temp_onehot_y)

        train_dataset = TensorDataset(torch.from_numpy(train_onehot_x), torch.from_numpy(train_onehot_y))
        test_dataset = TensorDataset(torch.from_numpy(test_onehot_x), torch.from_numpy(test_onehot_y))

        self.train_loader = DataLoader(train_dataset, shuffle=True, batch_size=self.batch_size)
        self.test_loader = DataLoader(test_dataset, shuffle=False, batch_size=self.batch_size)

        # add 2 more for padding and unknown
        vocabulary_size = len(self.vocabulary_indices) + 2

        model = LstmNetwork(vocabulary_size=vocabulary_size,
                            embedding_dimension=self.embedding_dimension,
                            hidden_dimension=self.hidden_dimension,
                            output_dimension=self.number_of_classes,
                            )

        self.model = model.to(self.device)

    def create_vocabulary_indices(self):
        sorted_vocabulary = sorted(self.all_vocabulary.items(), key=lambda x: x[1], reverse=True)

        if len(sorted_vocabulary) > self.vocabulary_max_size:
            sorted_vocabulary = sorted_vocabulary[:self.vocabulary_max_size]

        vocabulary_indices = {}

        for index, word_entry in enumerate(sorted_vocabulary):
            word = word_entry[0]
            vocabulary_indices[word] = index + 2

        return vocabulary_indices

    def pre_process(self, tokens, label):
        if label in self.all_labels:
            self.all_labels[label] = self.all_labels[label] + 1
        else:
            self.all_labels[label] = 1

        for word in tokens:
            if word in self.all_vocabulary:
                self.all_vocabulary[word] = self.all_vocabulary[word] + 1
            else:
                self.all_vocabulary[word] = 1

    def populate_onehot(self, tokens, label):
        entry = []
        for word in tokens:
            if word in self.vocabulary_indices:
                entry.append(self.vocabulary_indices[word])
            else:
                entry.append(1)

        if len(entry) < self.max_words_per_sample:
            entry = entry + [0] * (self.max_words_per_sample - len(entry))

        if len(entry) > self.max_words_per_sample:
            entry = entry[:self.max_words_per_sample]

        self.temp_onehot_x.append(entry)

        labels = [0.0] * self.number_of_classes
        labels[label] = 1.0
        self.temp_onehot_y.append(labels)

    @staticmethod
    def process_dataset(dataset, sample_processor):
        for sample in dataset:
            text = sample["text"]
            label = sample["label"]

            text = text.lower()
            text = text.replace("<br />", " \n ")
            words = nlp(text)
            tokens = []
            for word in words:
                tokens.append(word.text)
            sample_processor(tokens, label)

    def train(self):
        print('training...')
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

        start_time = time.time()

        for epoch in range(self.number_of_epochs):
            self.model.train()
            for inputs, labels in self.train_loader:
                inputs = inputs.to(self.device)
                labels = labels.float().to(self.device)

                logits = self.model(inputs)
                loss = torch.nn.functional.cross_entropy(logits, labels)
                optimizer.zero_grad()

                loss.backward()

                optimizer.step()

            print(f'Epoch: {epoch + 1:03d}/{self.number_of_epochs:03d} | '
                  f'Loss: {loss:.4f}')

            with torch.set_grad_enabled(False):
                print(f'training accuracy: '
                      f'{self.compute_accuracy(self.train_loader):.2f}%'
                      f'\ntest accuracy: '
                      f'{self.compute_accuracy(self.test_loader):.2f}%')

            print(f'Time elapsed: {(time.time() - start_time) / 60:.2f} min')

        print(f'Total Training Time: {(time.time() - start_time) / 60:.2f} min')
        print(f'Test accuracy: {self.compute_accuracy(self.test_loader):.2f}%')

    def compute_accuracy(self, data_loader):
        with torch.no_grad():
            correct_pred, num_examples = 0, 0

            for i, (features, targets) in enumerate(data_loader):
                features = features.to(self.device)
                targets = targets.float().to(self.device)

                logits = self.model(features)
                _, predicted_labels = torch.max(logits, 1)
                _, target_class = torch.max(targets, 1)

                num_examples += targets.size(0)
                correct_pred += (predicted_labels == target_class).sum()
            return correct_pred.float() / num_examples * 100


def main():
    random_seed = 42
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(random_seed)

    trainer = Trainer()
    trainer.train()


main()
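

As a possible follow-up (not in the original code), here is a hedged sketch of how the trained model could be used to classify a single new review, reusing the Trainer's vocabulary, tokenizer, and padding scheme defined above:

def predict_sentiment(trainer, text):
    tokens = [word.text for word in nlp(text.lower().replace("<br />", " \n "))]
    indices = [trainer.vocabulary_indices.get(token, 1) for token in tokens]  # 1 = unknown
    indices = (indices + [0] * trainer.max_words_per_sample)[:trainer.max_words_per_sample]  # pad / truncate
    batch = torch.tensor([indices], dtype=torch.long).to(trainer.device)

    trainer.model.eval()
    with torch.no_grad():
        logits = trainer.model(batch)
    # in the IMDB dataset label 1 is positive and label 0 is negative
    return "positive" if logits.argmax(dim=1).item() == 1 else "negative"


# example usage, assuming a trainer was created and trained as in main() above:
# trainer = Trainer()
# trainer.train()
# print(predict_sentiment(trainer, "A wonderful, moving film with great acting."))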




