diff --git a/labs/l1/NLP-L1.ipynb b/labs/l1/NLP-L1.ipynb index d10e8aebf59149b6044f22ac57b51c8dddb1c0a3..f04e0a6bb313cb0cb501cd0e8d626bb1dbfff4aa 100644 --- a/labs/l1/NLP-L1.ipynb +++ b/labs/l1/NLP-L1.ipynb @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -44,7 +44,8 @@ " self.max_sentences = max_sentences\n", " \n", " def __iter__(self):\n", - " with bz2.open('simplewiki.txt.bz2', 'rt') as sentences:\n", + " # Changed location to get it to work for myself locally.\n", + " with bz2.open('C:\\\\Users\\\\epii\\\\Documents\\\\simplewiki.txt.bz2', 'rt', encoding = 'cp850') as sentences:\n", " for i, sentence in enumerate(sentences):\n", " if self.max_sentences and i >= self.max_sentences:\n", " break\n", @@ -60,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -80,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -99,11 +100,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "170815\n" + ] + } + ], "source": [ - "print(sum(1 for t in tokens(full_dataset)))" + "print(sum(1 for t in tokens(mini_dataset)))" ] }, { @@ -131,15 +140,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "def make_vocab_and_counts(sentences, min_count=5):\n", - " # TODO: Replace the next line with your own code\n", - " return {}, np.array(())" + " # dictt will contain all words and their index in word_freq\n", + " dictt = {}\n", + " #word_freq contains the #Occurences for each non-dropped work\n", + " word_freq = np.array([])\n", + " i = 0\n", + "\n", + " for sentence in sentences:\n", + " for 
token in sentence:\n", + " #We are cheeky and store the #Occurences as negatives, so later differe between #Occurences and indices\n", + " if token in dictt:\n", + " dictt[token] -= 1\n", + " else:\n", + " dictt[token] = -1\n", + " \n", + " # Go through everything again to add words to freq.\n", + " # Cant really combine stuff thanks to the cutoff\n", + " for sentence in sentences:\n", + " for token in sentence:\n", + " if token in dictt and dictt[token] <= -min_count:\n", + " word_freq=np.append(word_freq,[-dictt[token]])\n", + " dictt[token] = i\n", + " i += 1\n", + " elif token in dictt and dictt[token] > -min_count and dictt[token] < 0:\n", + " dictt.pop(token, 0)\n", + " \n", + " return dictt, word_freq" ] }, { @@ -164,11 +197,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3231\n", + "155818.0\n", + "73339\n", + "17297355.0\n" + ] + } + ], "source": [ - "# TODO: Test your code here" + "cuda = False\n", + "\n", + "\n", + "min_vocab, min_counts = make_vocab_and_counts(mini_dataset)\n", + "print(len(min_counts))\n", + "print(sum(min_counts))\n", + "full_vocab, full_counts = make_vocab_and_counts(full_dataset)\n", + "print(len(full_counts))\n", + "print(sum(full_counts))" ] }, { @@ -207,13 +259,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ + "#preprocess\n", + "import random\n", "def preprocess(vocab, counts, sentences, threshold=0.001):\n", - " # TODO: Replace the next line with your own code\n", - " return iter(())" + " N = sum(counts)\n", + " for sentence in sentences:\n", + " new_sentence = np.array([]) #Not sure this should be an npArray or a normal one\n", + " for token in sentence:\n", + " # Cut token if not part or with random chance\n", + " if token in vocab and random.random() > 1-(threshold*N / counts[vocab[token]])**0.5 :\n", 
+ " new_sentence = np.append(new_sentence, vocab[token])\n", + " if len(new_sentence) > 0:\n", + " yield new_sentence" ] }, { @@ -240,11 +301,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "# TODO: Test your code here" + "# Something seems slightly off here as we keep getting ~65% instead of ~59%\n", + "#summ=0\n", + "#for i in preprocess(min_vocab, min_counts, mini_dataset, threshold=0.001):\n", + "# summ+=len(i)\n", + "#print(summ/155818)\n", + "#summ=0\n", + "#for i in preprocess(full_vocab, full_counts, full_dataset, threshold=0.001):\n", + "# summ+=len(i)\n", + "#print(summ/17297355)" ] }, { @@ -330,13 +399,61 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", "def training_examples(vocab, counts, sentences, window=5, num_ns=5, batch_size=1<<19, ns_exponent=0.75):\n", - " # TODO: Replace the next line with your own code\n", - " yield torch.zeros(batch_size).long(), torch.zeros(batch_size, 1 + num_ns).long()" + " # Set up tensors\n", + " target = torch.zeros(batch_size).long()\n", + " context = torch.zeros(batch_size, 1 + num_ns).long()\n", + " \n", + " exp_count = torch.from_numpy(counts)\n", + " exp_count **= ns_exponent\n", + " # Basis for our probability of negative samples\n", + " com_sums = torch.cumsum(exp_count,dim=0)\n", + " max_sum = com_sums[-1]\n", + " \n", + " # Pos keeps track of how many rows we have added so far\n", + " pos = 0\n", + " for sentence in sentences:\n", + " for k in range(0,len(sentence)):\n", + " # Our window\n", + " wind = random.randint(0,window) \n", + " # Seems weird to start this at 0, but I guess we are giving it a random chance to drop even more\n", + " # And this is what matches the expected output\n", + " for i in range(max(-k,-wind),min(len(sentence)-k,wind+1)):\n", + " if i != 0:\n", 
+ " # Might be able to tensor this, but it would not save much\n", + " target[pos] = torch.tensor([sentence[k]]).long()\n", + " context[pos,0] = torch.tensor([sentence[k+i]]).long()\n", + " pos += 1\n", + " if pos == batch_size:\n", + " # Negative sampling\n", + " context[:,1:] = torch.searchsorted(com_sums, max_sum*torch.rand(batch_size, num_ns))\n", + " if cuda:\n", + " target = target.to(torch.device('cuda'))\n", + " context = context.to(torch.device('cuda'))\n", + " \n", + " yield target, context\n", + " \n", + " #Reset\n", + " pos = 0\n", + " target = torch.zeros(batch_size).long()\n", + " context = torch.zeros(batch_size, 1 + num_ns).long()\n", + " \n", + " if pos > 0:\n", + " #Negative sampling\n", + " if cuda:\n", + " target = target.to(torch.device('cuda'))\n", + " context = context.to(torch.device('cuda'))\n", + " context[:pos,1:] = torch.searchsorted(com_sums, max_sum*torch.rand(pos, num_ns))\n", + " \n", + " #Cut trailing 0s\n", + " yield target[:pos], context[:pos,:]" ] }, { @@ -361,18 +478,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "# TODO: Test your code here" + "summ = 0\n", + "#min_preProc = preprocess(min_vocab, min_counts, mini_dataset, threshold=0.001)\n", + "#for target, context in training_examples(min_vocab, min_counts, min_preProc, window=5, num_ns=5, batch_size=10000, ns_exponent=0.75):\n", + "# summ += len(target)\n", + "#print(summ)\n", + "#print(summ/155818)\n", + "\n", + "#summ = 0\n", + "#preProc = preprocess(full_vocab, full_counts, full_dataset, threshold=0.001)\n", + "#for target, context in training_examples(full_vocab, full_counts, preProc, window=5, num_ns=5, batch_size=1<<19, ns_exponent=0.75):\n", + "# summ += len(target)\n", + "# #print(len(target))\n", + "# #print(summ)\n", + "#print(summ)\n", + "#print(summ/17297355)\n", + "#Seems to end up at ~3.8 once again slightly too large." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Problem 4: Implement the model" + "## Problem 4: Implement the model" ] }, { @@ -384,14 +516,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", - "\n", "class SGNSModel(nn.Module):\n", " \n", " def __init__(self, vocab, embedding_dim):\n", @@ -399,10 +530,20 @@ " self.vocab = vocab\n", " self.w = nn.Embedding(len(vocab), embedding_dim)\n", " self.c = nn.Embedding(len(vocab), embedding_dim)\n", + " if cuda:\n", + " self.w = self.w.to(torch.device('cuda'))\n", + " self.c = self.c.to(torch.device('cuda'))\n", " \n", " def forward(self, w, c):\n", - " # TODO: Replace the next line with your own code\n", - " return torch.zeros_like(c, dtype=torch.float, requires_grad=True)" + " D = torch.zeros(c.size(0), c.size(1)).float()\n", + " if cuda:\n", + " D = D.to(torch.device('cuda'))\n", + " _c = self.c(c)\n", + " _w = self.w(w)\n", + " for j in range(0, _c.size(1)):\n", + " D[:,j] = torch.sum(_w*_c[:,j], dim=1)\n", + " return D\n", + " " ] }, { @@ -429,16 +570,63 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# TODO: Test your code here" + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([-1.1112, 0.1935, -0.5805], grad_fn=<EmbeddingBackward0>)\n", + "tensor([-0.6418, 2.0867, -0.7420], grad_fn=<EmbeddingBackward0>)\n", + "1.5477675199508667\n" + ] + }, + { + "data": { + "text/plain": [ + "tensor([[ 1.5478, -0.0641],\n", + " [-0.5439, -2.2840],\n", + " [-0.3896, 8.4895],\n", + " [-4.6775, -3.6780],\n", + " [-0.2199, -3.3541]], grad_fn=<CopySlices>)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# TODO: Test your code here\n", + "import numpy as np\n", + 
"\n", + "\n", + "def random_example(vocab, counts, sentences):\n", + " skip = np.random.randint(100)\n", + " for i, example in enumerate(training_examples(vocab, counts, sentences, num_ns=1, batch_size=5)):\n", + " if i >= skip:\n", + " break\n", + " return example\n", + "\n", + "min_preProc = preprocess(min_vocab, min_counts, mini_dataset, threshold=0.001)\n", + "min_tar, min_cont = random_example(min_vocab,min_counts,min_preProc)\n", + "min_mod = SGNSModel(min_vocab, 3)\n", + "t1 = min_mod.w(min_tar[0])\n", + "t2 = min_mod.c(min_cont[0,0])\n", + "print(t1)\n", + "print(t2)\n", + "s = 0\n", + "for i in range(3):\n", + " s += t1[i] * t2[i]\n", + "print(s.item())\n", + "min_mod.forward(min_tar, min_cont)\n", + "#Seems to be working correctly" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -479,13 +667,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import torch.nn.functional as F\n", "import torch.optim as optim\n", + "from datetime import datetime\n", "\n", "def train(sentences, embedding_dim=50, window=5, num_ns=5, batch_size=1<<19, n_epochs=1, lr=1e-1):\n", " # Create the vocabulary and the counts\n", @@ -499,6 +688,39 @@ " \n", " # TODO: Add your code here\n", " \n", + " o_labs = torch.zeros(batch_size, 1 + num_ns).float()\n", + " o_labs[:,0] = torch.ones(batch_size).float()\n", + " if cuda:\n", + " o_labs = o_labs.to(torch.device('cuda'))\n", + " \n", + " # We train for several epochs\n", + " for t in range(n_epochs):\n", + " print(t)\n", + " preProc = preprocess(vocab, counts, sentences, threshold=0.001)\n", + " # In each epoch, we loop over all the minibatches\n", + " for tar, cont in training_examples(vocab, counts, preProc, window, num_ns, batch_size, ns_exponent=0.75):\n", + " # Reset the accumulated gradients\n", + " optimizer.zero_grad()\n", + " # Forward pass\n", + " ts1 = 
datetime.timestamp(datetime.now())\n", + " #print(\"prefor\", ts1)\n", + " output = model.forward(tar,cont)\n", + " ts2 = datetime.timestamp(datetime.now())\n", + " print(\"forward took: \", ts2-ts1)\n", + " \n", + " # Compute the loss\n", + " #The resize here is needed for the last batch, as it's smaller\n", + " loss = F.binary_cross_entropy_with_logits(output, o_labs[:output.size(0),:output.size(1)])\n", + " print(\"Loss: \",loss.item())\n", + " # Backward pass; propagates the loss and computes the gradients\n", + " ts3 = datetime.timestamp(datetime.now())\n", + " #print(\"preback\", ts3)\n", + " loss.backward()\n", + " ts4 = datetime.timestamp(datetime.now())\n", + " print(\"back took: \", ts4-ts3)\n", + " # Update the parameters of the model\n", + " optimizer.step()\n", + " \n", " return model" ] }, @@ -511,11 +733,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "model = train(mini_dataset, n_epochs=1)" + "cuda = True\n", + "#model = train(mini_dataset, embedding_dim=15, n_epochs=1, batch_size = 100000)" ] }, { @@ -533,11 +756,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "# TODO: Train your model on the full dataset here" + "cuda = True\n", + "# TODO: Train your model on the full dataset here\n", + "#model = train(full_dataset, n_epochs=10)" ] }, { @@ -558,23 +783,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ] + } + ], "source": [ "def save_model(model):\n", " # Extract the embedding vectors as a NumPy array\n", - " embeddings = model.w.weight.detach().numpy()\n", + " if cuda:\n", + " embeddings = model.w.weight.detach().to(torch.device('cpu')).numpy()\n", + " else:\n", + " embeddings = model.w.weight.detach().numpy()\n", " \n", " # Create the 
word–vector pairs\n", " items = sorted((i, w) for w, i in model.vocab.items())\n", " items = [(w, e) for (i, w), e in zip(items, embeddings)]\n", " \n", " # Write the embeddings and the word labels to files\n", - " with open('vectors.tsv', 'wt') as fp1, open('metadata.tsv', 'wt') as fp2:\n", + " with open('vectors.tsv', 'wt', encoding = 'cp850') as fp1, open('metadata.tsv', 'wt', encoding = 'cp850') as fp2:\n", " for w, e in items:\n", " print('\\t'.join('{:.5f}'.format(x) for x in e), file=fp1)\n", - " print(w, file=fp2)" + " print(w, file=fp2)\n", + " \n", + "cuda = True \n", + "save_model(train(full_dataset, n_epochs=3))" ] }, { @@ -601,6 +839,27 @@ "source": [ "👍 Well done!" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -619,7 +878,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.4" + "version": "3.9.0" } }, "nbformat": 4, diff --git a/labs/l1/Our_Work/NLP-L1.ipynb b/labs/l1/Our_Work/NLP-L1.ipynb index 09a319211ab42d891ced0c96319f89bb7f229e9e..234940f441b192e677d04d4a71c7e54b3ad741ea 100644 --- a/labs/l1/Our_Work/NLP-L1.ipynb +++ b/labs/l1/Our_Work/NLP-L1.ipynb @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -44,7 +44,8 @@ " self.max_sentences = max_sentences\n", " \n", " def __iter__(self):\n", - " with bz2.open('simplewiki.txt.bz2', 'rt') as sentences:\n", + " # Changed location to get it to work for myself locally.\n", + " with bz2.open('C:\\\\Users\\\\epii\\\\Documents\\\\simplewiki.txt.bz2', 'rt', encoding = 'cp850') as sentences:\n", " for i, sentence in enumerate(sentences):\n", " if self.max_sentences and i >= 
self.max_sentences:\n", " break\n", @@ -60,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -80,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -99,19 +100,19 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "17594885\n" + "170815\n" ] } ], "source": [ - "print(sum(1 for t in tokens(full_dataset)))" + "print(sum(1 for t in tokens(mini_dataset)))" ] }, { @@ -139,29 +140,33 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "def make_vocab_and_counts(sentences, min_count=5):\n", + " # dictt will contain all words and their index in word_freq\n", " dictt = {}\n", + " #word_freq contains the #Occurences for each non-dropped work\n", " word_freq = np.array([])\n", " i = 0\n", "\n", " for sentence in sentences:\n", " for token in sentence:\n", + " #We are cheeky and store the #Occurences as negatives, so later differe between #Occurences and indices\n", " if token in dictt:\n", " dictt[token] -= 1\n", " else:\n", " dictt[token] = -1\n", - "\n", + " \n", + " # Go through everything again to add words to freq.\n", + " # Cant really combine stuff thanks to the cutoff\n", " for sentence in sentences:\n", " for token in sentence:\n", " if token in dictt and dictt[token] <= -min_count:\n", " word_freq=np.append(word_freq,[-dictt[token]])\n", - " #print(token)\n", " dictt[token] = i\n", " i += 1\n", " elif token in dictt and dictt[token] > -min_count and dictt[token] < 0:\n", @@ -192,23 +197,23 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "73339" + "3231" ] }, - "execution_count": 7, + "execution_count": 6, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ - "_, count = make_vocab_and_counts(full_dataset)\n", - "len(count)" + "vocab, counts = make_vocab_and_counts(mini_dataset)\n", + "len(counts)" ] }, { @@ -247,17 +252,18 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "#preprocess\n", "import random\n", "def preprocess(vocab, counts, sentences, threshold=0.001):\n", - " N = np.sum(np.fromiter(counts,dtype=np.dtype((int,1))))\n", + " N = np.sum(np.fromiter(counts,dtype=np.dtype(int)))\n", " for sentence in sentences:\n", " new_sentence = np.array([]) #Not sure this should be an npArray or a normal one\n", " for token in sentence:\n", + " # Cut token if not part or with random chance\n", " if token in vocab and random.random() > 1-threshold*N / counts[vocab[token]] :\n", " new_sentence = np.append(new_sentence, vocab[token])\n", " if len(new_sentence) > 0:\n", @@ -288,35 +294,24 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 8, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_1725370/2744212062.py:4: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " print(np.sum(np.fromiter(counts,dtype=np.dtype((int,1)))))\n", - "/tmp/ipykernel_1725370/2744212062.py:5: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " N = np.sum(np.fromiter(counts,dtype=np.dtype((int,1))))\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "155818\n", - "85873\n" + "0.5510595694977474\n" ] } ], "source": [ - "#test\n", - "vocab, counts = make_vocab_and_counts(mini_dataset)\n", + "# Something seems slightly off here as we keep getting ~55% instead of ~59%\n", + "#vocab, 
count = make_vocab_and_counts(mini_dataset)\n", "summ=0\n", "for i in preprocess(vocab, counts, mini_dataset, threshold=0.001):\n", " summ+=len(i)\n", - "print(summ)" + "print(summ/155818)" ] }, { @@ -402,18 +397,7 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def training_examples(vocab, counts, sentences, window=5, num_ns=5, batch_size=1<<19, ns_exponent=0.75):\n", - " # TODO: Replace the next line with your own code\n", - " yield torch.zeros(batch_size).long(), torch.zeros(batch_size, 1 + num_ns).long()" - ] - }, - { - "cell_type": "code", - "execution_count": 80, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -421,31 +405,45 @@ "import torch.nn as nn\n", "import torch.nn.functional as F\n", "def training_examples(vocab, counts, sentences, window=5, num_ns=5, batch_size=1<<19, ns_exponent=0.75):\n", + " # Set up tensors\n", " target = torch.zeros(batch_size).long()\n", " context = torch.zeros(batch_size, 1 + num_ns).long()\n", - " exp_count = []\n", - " for i in counts:\n", - " exp_count.append(i**ns_exponent)\n", - " com_sums = torch.cumsum(torch.from_numpy(np.array(exp_count)),dim=0)\n", + " \n", + " exp_count = torch.from_numpy(counts)\n", + " exp_count **= ns_exponent\n", + " # Basis for our probability of negative samples\n", + " com_sums = torch.cumsum(exp_count,dim=0)\n", " max_sum = com_sums[-1]\n", + " \n", + " # Pos keeps track of how many rows we have added so far\n", " pos = 0\n", " for sentence in sentences:\n", " for k in range(0,len(sentence)):\n", " wind = random.randint(1,window)\n", - " for i in range(-wind,wind):\n", - " if i != 0 and k+1 >= 0 and k+i < len(sentence):\n", - " target.index_copy_(0,torch.tensor([pos]),torch.tensor([sentence[k]]).long())\n", - " #TODO Readd negative, and fix last batch\n", - " #target[pos] = sentence[k]\n", - " #context[pos][0] = sentence[k+i]\n", - " #for u in range(0, num_ns):\n", - " # context[pos][u+1] = 
torch.searchsorted(random.random()*max_sum)\n", + " #Our window\n", + " for i in range(max(-k,-wind),min(len(sentence)-k,wind+1)):\n", + " if i != 0:\n", + " # Might be able to tensor this, but it would not save much\n", + " target[pos] = torch.tensor([sentence[k]]).long()\n", + " context[pos,0] = torch.tensor([sentence[k+i]]).long()\n", " pos += 1\n", " if pos == batch_size:\n", + " #Negative sampling\n", + " context[:,1:] = torch.searchsorted(com_sums, max_sum*torch.rand(batch_size, num_ns))\n", + " \n", " yield target, context\n", + " \n", + " #Reset\n", " pos = 0\n", " target = torch.zeros(batch_size).long()\n", - " context = torch.zeros(batch_size, 1 + num_ns).long()" + " context = torch.zeros(batch_size, 1 + num_ns).long()\n", + " \n", + " if pos > 0:\n", + " #Negative sampling\n", + " context[:pos,1:] = torch.searchsorted(com_sums, max_sum*torch.rand(pos, num_ns))\n", + " \n", + " #Cut trailing 0s\n", + " yield target[:pos], context[:pos,:]" ] }, { @@ -459,13 +457,6 @@ "> Reads from an iterable of *sentences* (lists of string tokens), preprocesses them using the function implemented in Problem 2, and then yields pairs of input batches for gradient-based training, represented as described above. Each batch contains *batch_size* positive examples. The parameter *window* specifies the maximal distance between a target word and a context word in a positive example; the actual window size around any given target word is sampled uniformly at random. The parameter *num_ns* specifies the number of negative samples per positive sample. The parameter *ns_exponent* specifies the exponent in the negative sampling (called $\\alpha$ above)." 
] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, @@ -477,31 +468,24 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 10, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_1725370/3470889482.py:4: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " N = np.sum(np.fromiter(counts,dtype=np.dtype((int,1))))\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "384000\n" + "384189\n" ] } ], "source": [ + "# Again something is slightly off as we seem to be getting slightly more than what we would expect\n", "summ = 0\n", "\n", - "vocab, counts = make_vocab_and_counts(mini_dataset)\n", + "#vocab, counts = make_vocab_and_counts(mini_dataset)\n", "preProc = preprocess(vocab, counts, mini_dataset, threshold=0.001)\n", - "for target, context in training_examples(vocab, counts, preProc, window=5, num_ns=5, batch_size=1000, ns_exponent=0.75):\n", + "for target, context in training_examples(vocab, counts, preProc, window=5, num_ns=5, batch_size=10000, ns_exponent=0.75):\n", " summ += len(target)\n", "print(summ)" ] @@ -522,14 +506,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", - "\n", "class SGNSModel(nn.Module):\n", " \n", " def __init__(self, vocab, embedding_dim):\n", @@ -539,8 +522,15 @@ " self.c = nn.Embedding(len(vocab), embedding_dim)\n", " \n", " def forward(self, w, c):\n", - " # TODO: Replace the next line with your own code\n", - " return torch.zeros_like(c, dtype=torch.float, requires_grad=True)" + " D = torch.zeros(c.size(0), c.size(1)).float()\n", + " _c = self.c(c)\n", + " _w = 
self.w(w)\n", + " #Feels dumb to do this elementwise\n", + " for j in range(0, _c.size(1)):\n", + " for i, x, y in zip(range(_c.size(0)), _w, _c[:,j]):\n", + " D[i,j] = torch.dot(x,y)\n", + " return D\n", + " " ] }, { @@ -567,16 +557,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[-1.9941, -0.5351],\n", + " [-0.4325, -1.0734],\n", + " [-0.8902, -0.1611],\n", + " [-0.0079, 0.1741],\n", + " [ 0.2078, -0.6174]], grad_fn=<CopySlices>)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# TODO: Test your code here" + "# TODO: Test your code here\n", + "import numpy as np\n", + "\n", + "\n", + "def random_example(vocab, counts, sentences):\n", + " skip = np.random.randint(100)\n", + " for i, example in enumerate(training_examples(vocab, counts, sentences, num_ns=1, batch_size=5)):\n", + " if i >= skip:\n", + " break\n", + " return example\n", + "\n", + "vocab, counts = make_vocab_and_counts(mini_dataset)\n", + "preProc = preprocess(vocab, counts, mini_dataset, threshold=0.001)\n", + "tar, cont = random_example(vocab,counts,preProc)\n", + "mod = SGNSModel(vocab, 3)\n", + "mod.forward(tar, cont)\n", + "#I have no idea if this is the correct output" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -617,7 +638,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -636,6 +657,30 @@ " optimizer = optim.Adam(model.parameters(), lr=lr)\n", " \n", " # TODO: Add your code here\n", + " preProc = preprocess(vocab, counts, sentences, threshold=0.001)\n", + " \n", + " o_labs = torch.zeros(batch_size, 1 + num_ns).float()\n", + " o_labs[:,0] = torch.ones(batch_size).float()\n", + " \n", + " print(\"Time to do Stuff\")\n", + " # We train for several 
epochs\n", + " for t in range(n_epochs):\n", + " print(t)\n", + " # In each epoch, we loop over all the minibatches\n", + " for tar, cont in training_examples(vocab, counts, preProc, window, num_ns, batch_size, ns_exponent=0.75):\n", + " # Reset the accumulated gradients\n", + " optimizer.zero_grad()\n", + " # Forward pass\n", + " output = model.forward(tar,cont)\n", + " # Compute the loss\n", + " #The resize here is needed for the last batch, as it's smaller\n", + " loss = F.binary_cross_entropy_with_logits(output, o_labs[:output.size(0),:output.size(1)])\n", + " print(loss)\n", + " # Backward pass; propagates the loss and computes the gradients\n", + " # This seems to be the slow step?\n", + " loss.backward()\n", + " # Update the parameters of the model\n", + " optimizer.step()\n", " \n", " return model" ] @@ -651,9 +696,50 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to do Stuff\n", + "0\n", + "Zero\n", + "PreForward\n", + "PostForward\n", + "tensor(2.8481, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)\n", + "Step\n", + "Zero\n", + "PreForward\n", + "PostForward\n", + "tensor(2.6987, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)\n", + "Step\n", + "Zero\n", + "PreForward\n", + "PostForward\n", + "tensor(2.4863, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)\n", + "Step\n", + "Zero\n", + "PreForward\n", + "PostForward\n", + "tensor(2.3485, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)\n", + "Step\n", + "Zero\n", + "PreForward\n", + "PostForward\n", + "tensor(2.2820, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)\n", + "Step\n", + "Zero\n", + "PreForward\n", + "PostForward\n", + "tensor(2.1518, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)\n", + "Step\n", + "Zero\n", + "PreForward\n" + ] + } + ], "source": [ - "model = train(mini_dataset, n_epochs=1)" + "model = train(mini_dataset, n_epochs=1, batch_size = 10000)" ] }, { @@ 
-757,7 +843,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.9.0" } }, "nbformat": 4,