# Using a bi-lstm to sort a sequence of integers
import random
import string
18    "\n",
import mxnet as mx
from mxnet import gluon, nd
import numpy as np
## Data Preparation
# Shape of the synthetic data: how many sequences, and how many integers in each.
dataset_size = 60000
seq_len = 5
# Upper bound used when sampling the integers; its digit count also sizes the
# character encoding of a sequence.
max_num = 999
# Fraction of the dataset used for training; the remainder is held out for testing.
split = 0.8
# Number of samples per mini-batch served by the data loaders.
batch_size = 512
# Use a GPU context when MXNet reports at least one GPU, otherwise fall back to the CPU.
if mx.context.num_gpus() > 0:
    ctx = mx.gpu()
else:
    ctx = mx.cpu()
We generate a dataset of **dataset_size** sequences, each made of **seq_len** integers between **0** and **max_num**. We use **split*100%** of the sequences for training and the rest for testing.


For example:

50 10 200 999 30

Should return

10 30 50 200 999
50    "\n",
51    "\n",
52    "For example:\n",
53    "\n",
54    "50 10 200 999 30\n",
55    "\n",
56    "Should return\n",
57    "\n",
58    "10 30 50 200 999"
# Draw dataset_size sequences of seq_len integers in the range 0..max_num.
# uniform() produces floats and the int32 cast truncates them, so the upper bound
# has to be max_num + 1 for max_num itself to be reachable (with high=max_num the
# largest value ever produced was max_num - 1).
X = mx.random.uniform(low=0, high=max_num + 1, shape=(dataset_size, seq_len)).astype('int32').asnumpy()
# Guard against a float sample landing exactly on the upper bound, which would
# yield max_num + 1 and overflow the fixed-width string encoding.
X = np.minimum(X, max_num)
# The target is a copy of X with every row sorted in ascending order.
Y = X.copy()
Y.sort()
75   "metadata": {},
Input [548, 592, 714, 843, 602]
Target [548, 592, 602, 714, 843]
# Show the first input sequence together with its sorted target.
print("Input {}\nTarget {}".format(*(row.tolist() for row in (X[0], Y[0]))))
For training, we encode each input sequence as a string of characters rather than as numbers.
96  },
97  {
98   "cell_type": "code",
99   "execution_count": 5,
100   "metadata": {},
101   "outputs": [
102    {
103     "name": "stdout",
104     "output_type": "stream",
105     "text": [
0123456789
{'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9, ' ': 10}
108     ]
109    }
110   ],
111   "source": [
# Alphabet of the character-level encoding: the ten digits followed by a space.
vocab = "".join([string.digits, " "])
print(vocab)
# Lookup table from each character to its position in vocab.
vocab_idx = dict(zip(vocab, range(len(vocab))))
print(vocab_idx)
122    "We write a transform that will convert our numbers into text of maximum length **max_len**, and one-hot encode the characters.\n",
123    "For example:\n",
124    "\n",
125    "\"30 10\" corresponding indices are [3, 0, 10, 1, 0]\n",
126    "\n",
127    "We then one-hot encode these indices to get a matrix representation of our input. We don't need to encode our target, because the loss we are going to use supports sparse labels."
135    {
136     "name": "stdout",
137     "output_type": "stream",
138     "text": [
Maximum length of the string: 19
140     ]
141    }
# Longest possible encoded sequence: each of the seq_len numbers takes up to as many
# characters as max_num has digits, plus one separating space after every number
# except the last.
max_len = (len(str(max_num)) + 1) * seq_len - 1
print("Maximum length of the string: %s" % max_len)
146   ]
154    "def transform(x, y):\n",
155    "    x_string = ' '.join(map(str, x.tolist()))\n",
156    "    x_string_padded = x_string + ' '*(max_len-len(x_string))\n",
157    "    x = [vocab_idx[c] for c in x_string_padded]\n",
158    "    y_string = ' '.join(map(str, y.tolist()))\n",
159    "    y_string_padded = y_string + ' '*(max_len-len(y_string))\n",
160    "    y = [vocab_idx[c] for c in y_string_padded]\n",
161    "    return mx.nd.one_hot(mx.nd.array(x), len(vocab)), mx.nd.array(y)"
# Row index where the held-out data starts: rows before it are used for training,
# rows from it onward for testing.
split_idx = int(X.shape[0] * split)
# Pair inputs with targets and pass every (input, target) sample through transform.
train_dataset = gluon.data.ArrayDataset(
    X[:split_idx], Y[:split_idx]
).transform(transform)
test_dataset = gluon.data.ArrayDataset(
    X[split_idx:], Y[split_idx:]
).transform(transform)
180    {
Input [548 592 714 843 602]
Transformed data Input
186      "[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]\n",
187      " [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n",
Target [548 592 602 714 843]
Transformed data Target
[ 5.  4.  8. 10.  5.  9.  2. 10.  6.  0.  2. 10.  7.  1.  4. 10.  8.  4.
  3.]
# Show the first raw sample next to what the transformed training dataset yields for it.
print(
    "Input {}".format(X[0]),
    "Transformed data Input {}".format(train_dataset[0][0]),
    "Target {}".format(Y[0]),
    "Transformed data Target {}".format(train_dataset[0][1]),
    sep="\n"
)
# Mini-batch loader for training: samples are reshuffled and loaded by 20 worker processes.
train_data = gluon.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=20,
    last_batch='rollover',
)
# Mini-batch loader for the held-out data: original order, 5 worker processes.
# NOTE(review): per the DataLoader docs, last_batch='rollover' carries the final
# incomplete batch over to the next pass, so a single pass over test_data would not
# see those samples - confirm this is intended for evaluation.
test_data = gluon.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=5,
    last_batch='rollover',
)
## Creating the network
237  },
244    "net = gluon.nn.HybridSequential()\n",
246    "    net.add(\n",
247    "        gluon.rnn.LSTM(hidden_size=128, num_layers=2, layout='NTC', bidirectional=True),\n",
248    "        gluon.nn.Dense(len(vocab), flatten=False)\n",
249    "    )"
# Initialise the network parameters with the Xavier initializer on the selected context.
net.initialize(init=mx.init.Xavier(), ctx=ctx)
# Softmax cross-entropy loss used to train the network.
loss = gluon.loss.SoftmaxCELoss()
We use a learning rate schedule to improve the convergence of the model
276  },
# Learning-rate schedule with factor 0.75 and a step of ten times the number of
# batches in train_data (presumably one decay every 10 epochs - confirm).
schedule = mx.lr_scheduler.FactorScheduler(factor=0.75, step=10 * len(train_data))
# Start the schedule from the same learning rate that is handed to the optimizer below.
schedule.base_lr = 0.01
# Adam optimizer over all network parameters, driven by the schedule above.
trainer = gluon.Trainer(
    net.collect_params(),
    'adam',
    {'lr_scheduler': schedule, 'learning_rate': 0.01},
)
## Training loop
302  },
310     "output_type": "stream",
312      "Epoch [0] Loss: 1.6627886372227823, LR 0.01\n",
416    "epochs = 100\n",
417    "for e in range(epochs):\n",
418    "    epoch_loss = 0.\n",
419    "    for i, (data, label) in enumerate(train_data):\n",
420    "        data = data.as_in_context(ctx)\n",
421    "        label = label.as_in_context(ctx)\n",
422    "\n",
423    "        with mx.autograd.record():\n",
424    "            output = net(data)\n",
425    "            l = loss(output, label)\n",
426    "\n",
427    "        l.backward()\n",
428    "        trainer.step(data.shape[0])\n",
429    "    \n",
430    "        epoch_loss += l.mean()\n",
431    "        \n",
432    "    print(\"Epoch [{}] Loss: {}, LR {}\".format(e, epoch_loss.asscalar()/(i+1), trainer.learning_rate))"
441  },
448  },
457    "x_orig = X[split_idx+n]\n",
458    "y_orig = Y[split_idx+n]"
469    "    output = net(x.as_in_context(ctx).expand_dims(axis=0))\n",
470    "\n",
471    "    # Convert output back to string\n",
472    "    pred = ''.join([vocab[int(o)] for o in output[0].argmax(axis=1).asnumpy().tolist()])\n",
473    "    return pred"
482  },
492      "X         611 671 275 871 944\n",
493      "Predicted 275 611 671 871 944\n",
494      "Label     275 611 671 871 944\n"
502   ]
511  {
520      "10 30 130 500 999  \n"
528  {
535  {
544      "Only four numbers: 105 202 302 501    \n"
552  {
559  {
568      "Small digits: 8  0 42 28         \n",
577  },
