Commit c2403b2b authored by jackkolm

finished model code

parent d7a37177
# ===== load_data.py =====
import pandas as pd
import numpy as np


def load_data():
    # Hard-coded path to the games dataset: one row per game, with a
    # 'summary' text field and a 'meta_score' (1-100).
    file_path = 'C:\\repos\\text-mining\\project\\all_games.csv'
    df = pd.read_csv(file_path)
    return df

def split_data(df):
    """
    Split the data into a training set and a test set.
    Returns the two parts as a tuple.
    """
    test_data = pd.DataFrame()  # empty DataFrame to collect test rows
    drop_indexes = []
    min_review_score = 1
    max_review_score = 100
    # Take the first game at each meta_score (1-100) as the test set,
    # so the test data covers the full score range.
    for i in range(min_review_score, max_review_score + 1):
        row = df.loc[df['meta_score'] == i]
        # Skip scores that do not occur in the data
        if row.empty:
            continue
        if row.isnull().values.any():
            print(f"Warning: null values in rows with meta_score {i}")
        # Select only the first row for the current meta_score
        first_row = row.iloc[0]
        drop_indexes.append(first_row.name)
        # Add the first row to test_data
        test_data = pd.concat([test_data, pd.DataFrame([first_row])])
    # Remove the rows that went into the test set from the original data
    df = df.drop(drop_indexes).copy()
    # Shuffle the remaining data for training
    df = df.sample(frac=1.0, random_state=200)
    training_data = df
    return training_data, test_data

def prep_data(df):
    print(f"Size before prep: {len(df)}")
    df = df.dropna(subset=["summary"])
    print(f"Size after first drop: {len(df)}")
    training, test = split_data(df)
    print(f"Size of training: {len(training)}\nSize of test: {len(test)}")
    train_X = np.array(training["summary"])
    train_Y = np.array(training["meta_score"])
    test_X = np.array(test["summary"])
    test_Y = np.array(test["meta_score"])
    return train_X, train_Y, test_X, test_Y


def load_and_prep_data():
    df = load_data()
    return prep_data(df)
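
# Sketch: a quick smoke test for the loader; assumes the CSV at the
# hard-coded path above exists with 'summary' and 'meta_score' columns
# and is reasonably large.
if __name__ == "__main__":
    train_X, train_Y, test_X, test_Y = load_and_prep_data()
    assert len(test_X) <= 100           # at most one game per meta_score 1-100
    assert len(train_X) > len(test_X)   # the bulk of the data stays in training
    print(train_X[0][:80], train_Y[0])  # first summary snippet and its score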

# ===== LSTM regression model (PyTorch) =====
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from load_data import load_and_prep_data

# for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load data
train_X, train_Y, test_X, test_Y = load_and_prep_data()

# params
max_sequence_length = 180
embedding_dim = 6000  # unused here; the model below hard-codes its embedding size
RNN = True  # should, for report, always be true

# fetching tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
X_train_sequences = tokenizer.batch_encode_plus(
    list(train_X), padding='max_length', truncation=True, max_length=max_sequence_length, return_tensors="pt"
)['input_ids']
X_test_sequences = tokenizer.batch_encode_plus(
    list(test_X), padding='max_length', truncation=True, max_length=max_sequence_length, return_tensors="pt"
)['input_ids']
max_vocab_size = tokenizer.vocab_size


def pad_sequences(sequences, maxlen, padding='post', truncating='pre'):
    # The tokenizer above already pads/truncates to max_length, so this is
    # effectively a no-op here; kept for parity with the Keras pipeline.
    padded = torch.zeros((len(sequences), maxlen), dtype=torch.long)
    for i, seq in enumerate(sequences):
        if truncating == 'pre':
            seq = seq[-maxlen:]
        else:
            seq = seq[:maxlen]
        if padding == 'post':
            padded[i, :len(seq)] = torch.as_tensor(seq, dtype=torch.long)
        else:
            padded[i, -len(seq):] = torch.as_tensor(seq, dtype=torch.long)
    return padded


X_train = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post', truncating='pre').to(device)
X_test = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post', truncating='pre').to(device)
y_train = torch.tensor(train_Y, dtype=torch.float32).unsqueeze(1).to(device)
y_test = torch.tensor(test_Y, dtype=torch.float32).unsqueeze(1).to(device)
# Safety clamp: BERT token ids are already < vocab_size, so this is defensive only.
X_train_tensor = torch.clamp(X_train, max=max_vocab_size - 1)
X_test_tensor = torch.clamp(X_test, max=max_vocab_size - 1)
dataset_train = TensorDataset(X_train_tensor, y_train)
dataset_test = TensorDataset(X_test_tensor, y_test)
dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)
dataloader_test = DataLoader(dataset_test, batch_size=32, shuffle=False)
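
# Sketch: sanity-check one batch's shapes before training (assumes the
# training set has at least 32 rows, so the first batch is full).
_batch_X, _batch_Y = next(iter(dataloader_train))
assert _batch_X.shape == (32, max_sequence_length)  # token ids: (batch, seq_len)
assert _batch_Y.shape == (32, 1)                    # one regression target per game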
# Earlier experiments, kept for reference but unused in the report:
#class CNNModel(nn.Module):
#    def __init__(self, vocab_size, embedding_dim, output_dim):
#        super(CNNModel, self).__init__()
#        self.embedding = nn.Embedding(vocab_size, embedding_dim)
#        self.conv1 = nn.Conv1d(embedding_dim, 64, kernel_size=5)
#        self.pool = nn.AdaptiveMaxPool1d(1)
#        self.fc1 = nn.Linear(64, 32)
#        self.dropout = nn.Dropout(0.5)
#        self.fc2 = nn.Linear(32, output_dim)
#
#    def forward(self, x):
#        x = self.embedding(x).permute(0, 2, 1)
#        x = F.relu(self.conv1(x))
#        x = self.pool(x).squeeze(2)
#        x = F.relu(self.fc1(x))
#        x = self.dropout(x)
#        return self.fc2(x)
#
#class LSTMModelAlt(nn.Module):
#    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
#        super(LSTMModelAlt, self).__init__()
#        self.embedding = nn.Embedding(vocab_size, embedding_dim)
#        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
#        self.fc1 = nn.Linear(hidden_dim, 16)
#        #self.fc2 = nn.Linear(16, hidden_dim)
#        self.dropout = nn.Dropout(0.5)
#        self.fc3 = nn.Linear(16, output_dim)
#
#    def forward(self, x):
#        x = self.embedding(x)
#        x, _ = self.lstm(x)
#        x = x[:, -1, :]
#        x = F.relu(self.fc1(x))
#        x = self.dropout(x)
#        return self.fc3(x)

class LSTMModel(nn.Module):
    def __init__(self, vocab_size):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, 128)
        self.lstm = nn.LSTM(128, 32, batch_first=True)
        self.fc1 = nn.Linear(32, 16)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(16, 1)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # last hidden state of the sequence
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)
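
# Sketch: a shape-only forward pass through an untrained copy of the model;
# the toy vocab size and sequence length here are arbitrary.
_m = LSTMModel(vocab_size=100)
_dummy = torch.randint(0, 100, (2, 10))  # 2 sequences of 10 token ids
assert _m(_dummy).shape == (2, 1)        # one score per sequence
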
model = LSTMModel(max_vocab_size)
model.to(device) # to gpu
# training time
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

def plot_losses(train_losses, epochs):
    # Plot the training loss
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, epochs + 1), train_losses, marker='o', label='Training Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training Loss Over Epochs')
    plt.legend()
    plt.grid()
    plt.show()

def training():
    losses = []
    epochs = 24
    for epoch in range(epochs):
        batch_losses = []
        model.train()
        for batch_X, batch_Y in dataloader_train:
            batch_X, batch_Y = batch_X.to(device), batch_Y.to(device)  # move data to GPU
            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output, batch_Y)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
        avg_loss = sum(batch_losses) / len(batch_losses)
        print(f"Epoch {epoch}: Loss = {avg_loss}")
        losses.append(avg_loss)

    # eval
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch_X, batch_Y in dataloader_test:
            batch_X, batch_Y = batch_X.to(device), batch_Y.to(device)
            predictions = model(batch_X)
            y_true.extend(batch_Y.cpu().numpy())
            y_pred.extend(predictions.cpu().numpy())
    mae_value = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"MAE: {mae_value}, MSE: {mse}, R2: {r2}")
    plot_losses(losses, epochs)


training()

# ===== Keras models: RNN (used in report) and CNN (unused) =====
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from load_data import load_and_prep_data

RUN_NETWORK = True
RNN = True

# prep data and parameters
train_X, train_Y, test_X, test_Y = load_and_prep_data()
max_vocab_size = 6000
max_sequence_length = 180
embedding_dim = 6000

# custom tokenizer
# Note: fitting on the test texts as well leaks test vocabulary into the
# word index; harmless for these experiments, but worth being aware of.
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(train_X)
tokenizer.fit_on_texts(test_X)
X_train_sequences = tokenizer.texts_to_sequences(train_X)
X_test_sequences = tokenizer.texts_to_sequences(test_X)
X_train = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post', truncating='pre')
X_test = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post', truncating='pre')
y_train = train_Y
y_test = test_Y

# build models --- only the RNN is used in the report
model = Sequential()
# Note: the 'r2_score' metric string requires a Keras version that registers
# an R2 metric; on older versions, compute R2 after training instead (see the
# sketch below).
all_metrics = ['mean_absolute_error', 'mean_squared_error', 'r2_score']
if RUN_NETWORK:
    if RNN:
        model.add(Embedding(input_dim=max_vocab_size, output_dim=128))
        model.add(LSTM(32, return_sequences=False))
        model.add(Dense(16, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='linear'))
        model.compile(
            loss='mean_squared_error',
            optimizer=Adam(learning_rate=0.001),
            metrics=all_metrics
        )
        history = model.fit(
            X_train,
            y_train,
            epochs=24,
            batch_size=32,
            validation_data=(X_test, y_test)
        )
        plt.plot(history.history['loss'], label='Training Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.title('Loss per Epoch')
        plt.show()
    else:
        # CNN network --- works, but left out of the report
        model.add(Embedding(input_dim=max_vocab_size, output_dim=embedding_dim))
        model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
        model.add(GlobalMaxPooling1D())
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='linear'))
        model.compile(
            loss='mean_squared_error',
            optimizer=Adam(learning_rate=0.001),
            metrics=all_metrics
        )
        model.fit(X_train, y_train, epochs=2, batch_size=32, validation_data=(X_test, y_test))
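
# Sketch: the same metrics can also be computed after training with
# scikit-learn, which avoids depending on 'r2_score' being available as a
# compile-time metric string in the installed Keras.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = model.predict(X_test).ravel()  # flatten (n, 1) predictions to (n,)
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"R2:  {r2_score(y_test, y_pred)}")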

# ===== Transformer regression model (PyTorch) =====
import math

import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

from load_data import load_and_prep_data


class ReviewDataset(Dataset):
    def __init__(self, text_summaries, review_scores, tokenizer):
        self.text_summaries = text_summaries
        self.review_scores = review_scores
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.text_summaries)

    def __getitem__(self, idx):
        # Tokenization happens lazily, per item.
        tokens = self.tokenizer(self.text_summaries[idx], padding='max_length', truncation=True, return_tensors="pt")
        label = self.review_scores[idx] / 100.0  # normalize the label (review score) to [0, 1]
        return tokens, label

class PositionalEncoding(nn.Module):
    """Standard sinusoidal positional encoding:
    PE(pos, 2i) = sin(pos / 10000^(2i/d_model)), PE(pos, 2i+1) = cos(...).
    Stored batch-first to match the batch-first encoder below.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (batch, seq_len, d_model)
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

class TransformerModel(nn.Transformer):
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__(
            d_model=ninp, nhead=nhead, dim_feedforward=nhid,
            num_encoder_layers=nlayers, batch_first=True
        )
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        self.input_emb = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, 1)
        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        # Unused here: causal masking is not needed when regressing over the
        # whole summary. log(tril) is 0 on/below the diagonal, -inf above.
        return torch.log(torch.tril(torch.ones(sz, sz)))

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.input_emb.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, input_ids, attention_mask):
        embeddings = self.input_emb(input_ids) * math.sqrt(self.ninp)
        embeddings = self.pos_encoder(embeddings)
        # src_key_padding_mask expects (batch, seq_len) with True at padding
        # positions; the HF attention_mask is 1 at real tokens, so invert it.
        output = self.encoder(embeddings, src_key_padding_mask=~attention_mask.bool())
        output = self.decoder(output.mean(dim=1))  # mean-pool over tokens, then regress
        return output
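
# Sketch: a shape-only check of the model above, with small dimensions chosen
# only so it runs fast on CPU.
_tm = TransformerModel(ntoken=100, ninp=16, nhead=4, nhid=32, nlayers=1, dropout=0.1)
_ids = torch.randint(0, 100, (2, 12))        # (batch=2, seq_len=12) token ids
_mask = torch.ones(2, 12, dtype=torch.long)  # 1 = real token, nothing padded
assert _tm(_ids, _mask).shape == (2, 1)      # one normalized score per summary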

def plot_losses(train_losses, epochs):
    # Plot the training loss
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, epochs + 1), train_losses, marker='o', label='Training Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training Loss Over Epochs')
    plt.legend()
    plt.grid()
    plt.show()

text_summaries, review_scores, test_text_summaries, test_review_scores = load_and_prep_data()
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # bert tokenizer again
tokenizer_vocab = tokenizer.get_vocab()
# Tried filtering the vocab to see if it would increase speed. Note that
# `vocab` is not a kwarg that from_pretrained recognizes, so this reload
# appears to have no practical effect: the full bert-base-uncased vocab is used.
top_n = 5000
filtered_vocab = dict(sorted(tokenizer_vocab.items(), key=lambda item: item[1])[:top_n])
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', vocab=filtered_vocab)
# Unused leftovers from an earlier eager-tokenization approach:
#tokens_ = tokenizer(list(text_summaries), padding='max_length', truncation=True, return_tensors="pt")
#normalized_labels = review_scores / 100.0

dataset = ReviewDataset(text_summaries, review_scores, tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, pin_memory=True)

# hyperparameters
num_epochs = 24
ntoken = tokenizer.vocab_size
ninp = 512     # embedding / model dimension
nhead = 8      # attention heads
nhid = 4       # feed-forward dimension
nlayers = 2    # encoder layers
dropout = 0.5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerModel(ntoken, ninp, nhead, nhid, nlayers, dropout)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
model = model.to(device)

test_dataset = ReviewDataset(test_text_summaries, test_review_scores, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, pin_memory=True)

all_losses = []
for epoch in range(num_epochs):
    model.train()
    batch_losses = []
    for batch in dataloader:
        tokens, labels = batch
        labels = labels.float().to(device)
        input_ids = tokens['input_ids'].squeeze(1)  # drop the extra dim added by return_tensors="pt"
        attention_mask = tokens['attention_mask'].squeeze(1)
        predictions = model(input_ids.to(device), attention_mask.to(device))
        loss = loss_fn(predictions, labels.unsqueeze(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    avg_loss = sum(batch_losses) / len(batch_losses)
    all_losses.append(avg_loss)
    print("------------------------")
    print(f"Epoch {epoch}: Loss = {avg_loss}")

# eval
model.eval()
all_predictions = []
all_labels = []
with torch.no_grad():
    for batch in test_dataloader:
        tokens, labels = batch
        labels = labels.float().to(device)
        input_ids = tokens['input_ids'].squeeze(1).to(device)
        attention_mask = tokens['attention_mask'].squeeze(1).to(device)
        predictions = model(input_ids, attention_mask)
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
all_predictions = [pred[0] for pred in all_predictions]
mae = mean_absolute_error(all_labels, all_predictions)
r2 = r2_score(all_labels, all_predictions)
mse = mean_squared_error(all_labels, all_predictions)
# Labels were divided by 100, so scale MAE back up linearly and MSE
# quadratically to report on the original 0-100 score scale.
print(f"Mean Absolute Error: {mae * 100}")
print(f"R2 Score: {r2}")
print(f"Mean Squared Error: {mse * 100 ** 2}")
plot_losses(all_losses, num_epochs)