|
|
| (2 intermediate revisions by the same user not shown) |
| Line 1: |
Line 1: |
| =MNIST Convolutional Neural Network= | | =Simple MNIST Convolutional Network= |
|
| |
|
| Concept: Simple, end-to-end, LeNet-5-like convolutional MNIST model example. Meant as a tutorial for simple convolutional models.
| | ==Input Function== |
|
| |
|
| Link to code: https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py
| | Define an input function. This has an internal function that parses the example data (one piece of data at a time) and one-hot encodes the labeled images with the digit it corresponds to. |
| | |
| Link to tutorial(s): https://www.tensorflow.org/tutorials/
| |
| | |
| Link to original data set: http://yann.lecun.com/exdb/mnist/
| |
| | |
| ==License==
| |
| | |
| <pre>
| |
| # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
| |
| #
| |
| # Licensed under the Apache License, Version 2.0 (the "License");
| |
| # you may not use this file except in compliance with the License.
| |
| # You may obtain a copy of the License at
| |
| #
| |
| # http://www.apache.org/licenses/LICENSE-2.0
| |
| #
| |
| # Unless required by applicable law or agreed to in writing, software
| |
| # distributed under the License is distributed on an "AS IS" BASIS,
| |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
| |
| # See the License for the specific language governing permissions and
| |
| # limitations under the License.
| |
| # ==============================================================================
| |
| </pre>
| |
| | |
| ==Import Statements and Variables==
| |
| | |
| Import statements:
| |
| | |
| <pre>
| |
| from __future__ import absolute_import
| |
| from __future__ import division
| |
| from __future__ import print_function
| |
| | |
| import argparse
| |
| import gzip
| |
| import os
| |
| import sys
| |
| import time
| |
| | |
| import numpy
| |
| from six.moves import urllib
| |
| from six.moves import xrange # pylint: disable=redefined-builtin
| |
| import tensorflow as tf
| |
| </pre>
| |
| | |
| Variable definitions for use in the rest of the model:
| |
| | |
| <pre>
| |
| SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
| |
| WORK_DIRECTORY = 'data'
| |
| IMAGE_SIZE = 28
| |
| NUM_CHANNELS = 1
| |
| PIXEL_DEPTH = 255
| |
| NUM_LABELS = 10
| |
| VALIDATION_SIZE = 5000 # Size of the validation set.
| |
| SEED = 66478 # Set to None for random seed.
| |
| BATCH_SIZE = 64
| |
| NUM_EPOCHS = 10
| |
| EVAL_BATCH_SIZE = 64
| |
| EVAL_FREQUENCY = 100 # Number of steps between evaluations.
| |
| FLAGS = None
| |
| </pre>
| |
| | |
| ==Obtaining the Data==
| |
| | |
| Several functions are defined to help obtain the data. First, define the variable types we will use in the model:
| |
| | |
| <pre>
| |
| def data_type():
| |
| """Return the type of the activations, weights, and placeholder variables."""
| |
| if FLAGS.use_fp16:
| |
| return tf.float16
| |
| else:
| |
| return tf.float32
| |
| </pre>
| |
| | |
| Now define a function that will attempt to download the data if it does not already exist on disk. This uses urllib to obtain the MNIST files, and TensorFlow's gfile module to interact with the file and filesystem.
| |
| | |
| <pre>
| |
| def maybe_download(filename):
| |
| """Download the data from Yann's website, unless it's already here."""
| |
| if not tf.gfile.Exists(WORK_DIRECTORY):
| |
| tf.gfile.MakeDirs(WORK_DIRECTORY)
| |
| filepath = os.path.join(WORK_DIRECTORY, filename)
| |
| if not tf.gfile.Exists(filepath):
| |
| filepath, _ = urllib.request.urlretrieve(SOURCE_URL + filename, filepath)
| |
| with tf.gfile.GFile(filepath) as f:
| |
| size = f.size()
| |
| print('Successfully downloaded', filename, size, 'bytes.')
| |
| return filepath
| |
| </pre>
| |
| | |
| Once the data is downloaded, it must be converted to a format convenient for TensorFlow - in particular, a 4D tensor in which the first index is the image number, the second and third are the row (y) and column (x) of the pixel, and the fourth dimension is the channel of the image.
| |
| | |
| These values are then normalized and re-scaled.
| |
| | |
| <pre>
| |
| def extract_data(filename, num_images):
| |
| """Extract the images into a 4D tensor [image index, y, x, channels].
| |
| Values are rescaled from [0, 255] down to [-0.5, 0.5].
| |
| """
| |
| print('Extracting', filename)
| |
| with gzip.open(filename) as bytestream:
| |
| bytestream.read(16)
| |
| buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS)
| |
| data = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.float32)
| |
| data = (data - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH
| |
| data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)
| |
| return data
| |
| </pre>
| |
| | |
| The labels - the correct digit for each image, not the model's predictions - must also be put into a format convenient for TensorFlow - a 1D vector of int64 label IDs:
| |
| | |
| <pre>
| |
| def extract_labels(filename, num_images):
| |
| """Extract the labels into a vector of int64 label IDs."""
| |
| print('Extracting', filename)
| |
| with gzip.open(filename) as bytestream:
| |
| bytestream.read(8)
| |
| buf = bytestream.read(1 * num_images)
| |
| labels = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.int64)
| |
| return labels
| |
| </pre>
| |
| | |
| There's also a utility for creating a fake data set.
| |
| | |
| ==Error Rate==
| |
| | |
| There is a function defined to compute the error rate. It computes the accuracy first: sums up the number of correctly-labeled digits, divides by the total number of digits, multiplies by 100 to convert to percent. Last, it subtracts the accuracy from 100 to get a percent error.
| |
|
| |
|
| <pre> | | <pre> |
| def error_rate(predictions, labels): | | def input_fn(mode, batch_size=1): |
| """Return the error rate based on dense predictions and sparse labels.""" | | """A simple input_fn using the contrib.data input pipeline.""" |
| return 100.0 - (
| |
| 100.0 *
| |
| numpy.sum(numpy.argmax(predictions, 1) == labels) /
| |
| predictions.shape[0])
| |
| </pre>
| |
|
| |
|
| Note that this metric is NOT used for training the convolutional network, it is only used for printing purposes.
| | def example_parser(serialized_example): |
| | | """Parses a single tf.Example into image and label tensors.""" |
| ==Main Method==
| | features = tf.parse_single_example( |
| | | serialized_example, |
| ===Get Data=== | | features={ |
| | | 'image_raw': tf.FixedLenFeature([], tf.string), |
| The main method starts by checking if it is in self-test mode (debug mode), in which case, it generates fake data:
| | 'label': tf.FixedLenFeature([], tf.int64), |
| | | }) |
| <pre>
| | image = tf.decode_raw(features['image_raw'], tf.uint8) |
| if FLAGS.self_test:
| | image.set_shape([28 * 28]) |
| print('Running self-test.')
| |
| train_data, train_labels = fake_data(256)
| |
| validation_data, validation_labels = fake_data(EVAL_BATCH_SIZE) | |
| test_data, test_labels = fake_data(EVAL_BATCH_SIZE) | |
| num_epochs = 1
| |
| </pre>
| |
|
| |
|
| Otherwise, it extracts data from the downloaded training/testing MNIST images. None of this is using TensorFlow functionality yet.
| | # Normalize the values of the image from the range [0, 255] to [-0.5, 0.5] |
| | image = tf.cast(image, tf.float32) / 255 - 0.5 |
| | label = tf.cast(features['label'], tf.int32) |
| | return image, tf.one_hot(label, 10) |
|
| |
|
| <pre>
| | if mode == tf.estimator.ModeKeys.TRAIN: |
| | tfrecords_file = os.path.join(FLAGS.data_dir, 'train.tfrecords') |
| else: | | else: |
| # Get the data. | | assert mode == tf.estimator.ModeKeys.EVAL, 'invalid mode' |
| train_data_filename = maybe_download('train-images-idx3-ubyte.gz')
| | tfrecords_file = os.path.join(FLAGS.data_dir, 'test.tfrecords') |
| train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz') | |
| test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz')
| |
| test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz')
| |
|
| |
|
| # Extract it into numpy arrays.
| | assert tf.gfile.Exists(tfrecords_file), ( |
| train_data = extract_data(train_data_filename, 60000)
| | 'Run convert_to_records.py first to convert the MNIST data to TFRecord ' |
| train_labels = extract_labels(train_labels_filename, 60000)
| | 'file format.') |
| test_data = extract_data(test_data_filename, 10000)
| |
| test_labels = extract_labels(test_labels_filename, 10000)
| |
|
| |
|
| # Generate a validation set.
| | dataset = tf.contrib.data.TFRecordDataset([tfrecords_file]) |
| validation_data = train_data[:VALIDATION_SIZE, ...]
| |
| validation_labels = train_labels[:VALIDATION_SIZE]
| |
| train_data = train_data[VALIDATION_SIZE:, ...]
| |
| train_labels = train_labels[VALIDATION_SIZE:]
| |
| num_epochs = NUM_EPOCHS
| |
| train_size = train_labels.shape[0]
| |
| </pre>
| |
|
| |
|
| ===Variable Definitions=== | | # For training, repeat the dataset forever |
| | if mode == tf.estimator.ModeKeys.TRAIN: |
| | dataset = dataset.repeat() |
|
| |
|
| To use the input variables we have on the computational graph that TensorFlow will build, we have to declare those input variables. We do that using a TensorFlow placeholder type.
| | # Map example_parser over dataset, and batch results by up to batch_size |
| | dataset = dataset.map( |
| | example_parser, num_threads=1, output_buffer_size=batch_size) |
| | dataset = dataset.batch(batch_size) |
| | images, labels = dataset.make_one_shot_iterator().get_next() |
|
| |
|
| We also created a placeholder variable in the [[TensorFlow/Adversarial Crypto]] class, in the [[TensorFlow/Adversarial_Crypto#AdversarialCrypto_Class_-_Creation_of_Message_and_Key|Creation of Message and Key]] section. There, the placeholder variable was batch size. (However, the input variables in that network were different because we used that somewhat mysterious function, "tf.contrib.framework.arg_scope()", and passed it TensorFlow variables to initialize on each of the three graphs we had.)
| | return images, labels |
| | |
| Here, we are explicitly creating a placeholder for the training data (inputs), the training labels (expected outputs), and the evaluation data. (Remember, the data_type() function was defined above and just returns a float16 or float32 type.)
| |
| | |
| Again, notice the four dimensions - meaning train_data_node is a placeholder representing a 4D tensor.
| |
| | |
| <pre>
| |
| # This is where training samples and labels are fed to the graph.
| |
| # These placeholder nodes will be fed a batch of training data at each
| |
| # training step using the {feed_dict} argument to the Run() call below.
| |
| train_data_node = tf.placeholder(
| |
| data_type(),
| |
| shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
| |
| train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,))
| |
| eval_data = tf.placeholder(
| |
| data_type(),
| |
| shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
| |
| </pre> | | </pre> |
|
| |
|
| Next, we create a set of tensors that will hold the weights and biases of the convolutional neural network layers. (When we use high-level neural net APIs like [[Keras]], this is all taken care of for us, but here we do it all manually.)
| | ==Prepare Model== |
| | |
| ====Variables for Convolutional Layers==== | |
| | |
| First, convolutional layer 1: in this layer we are defining a 5 x 5 filter (kernel) that slides across each 28 x 28 pixel image in the MNIST data set. The weight tensor has shape [5, 5, NUM_CHANNELS, 32]: a 5 x 5 window, over 1 input channel, producing 32 feature maps. Note that 5 x 5 is the size of the filter, not the size of the output: because the convolution below uses 'SAME' padding with a stride of 1, each of the 32 feature maps is still 28 x 28 pixels. The idea is that each of those feature maps will pick out a particular feature about the image topology that is important, and "represent" it, or "allow" that signal to pass through the first layer of the convolutional neural network.
| |
| | |
| A bias is a learned offset that is added to every value of a feature map before the ReLU is applied, so we need the number of biases to match the number of feature maps (32).
| |
|
| |
|
| <pre> | | <pre> |
| # The variables below hold all the trainable weights. They are passed an | | def mnist_model(inputs, mode): |
| # initial value which will be assigned when we call: | | """Takes the MNIST inputs and mode and outputs a tensor of logits.""" |
| # {tf.global_variables_initializer().run()} | | # Input Layer |
| conv1_weights = tf.Variable(
| | # Reshape X to 4-D tensor: [batch_size, width, height, channels] |
| tf.truncated_normal([5, 5, NUM_CHANNELS, 32], # 5x5 filter, depth 32.
| | # MNIST images are 28x28 pixels, and have one color channel |
| stddev=0.1,
| | inputs = tf.reshape(inputs, [-1, 28, 28, 1]) |
| seed=SEED, dtype=data_type()))
| | data_format = FLAGS.data_format |
| conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type())) | |
| </pre>
| |
|
| |
|
| We do the same thing with the second convolutional layer. This time the input depth is 32 (the number of feature maps produced by the first layer) instead of NUM_CHANNELS, and the layer produces 64 feature maps.
| | if data_format is None: |
| | # When running on GPU, transpose the data from channels_last (NHWC) to |
| | # channels_first (NCHW) to improve performance. |
| | # See https://www.tensorflow.org/performance/performance_guide#data_formats |
| | data_format = ('channels_first' if tf.test.is_built_with_cuda() else |
| | 'channels_last') |
|
| |
|
| <pre>
| | if data_format == 'channels_first': |
| conv2_weights = tf.Variable(tf.truncated_normal(
| | inputs = tf.transpose(inputs, [0, 3, 1, 2]) |
| [5, 5, 32, 64], stddev=0.1,
| |
| seed=SEED, dtype=data_type()))
| |
| conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type()))
| |
| </pre> | | </pre> |
|
| |
|
| The truncated_normal calls determine how the variables are initialized - they are initialized with values drawn from a (truncated) normal distribution. Truncated means that any value falling more than two standard deviations from the mean is discarded and re-drawn, so no weight starts out with an extreme value, which makes training better behaved.
| | ==Construct Model== |
| | |
| Link to truncated_normal documentation: https://www.tensorflow.org/api_docs/python/tf/truncated_normal
| |
| | |
| ====Variables for Fully Connected Layers==== | |
| | |
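| Next come the two fully connected (FC) layers, which combine the features detected by the convolutional layers. The shape of an FC weight matrix is [number of inputs, number of outputs]. For the first FC layer the number of inputs is (IMAGE_SIZE // 4) * (IMAGE_SIZE // 4) * 64: each of the two 2 x 2 max-pooling steps halves the width and height, so the 28 x 28 image has become 7 x 7, with 64 feature maps, giving 7 * 7 * 64 = 3136 inputs. The number of outputs is 512.
|
| |
| (NOTE: At first glance this looks like waaaaaay too many neurons, but the first number is a count of inputs, not a count of neurons.)
|
| |
| (That is, the first number is the number of inputs and the second is the number of outputs - there are (size/4)*(size/4)*64 inputs on one side, 512 outputs on the other side.)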
| |
|
| |
|
| <pre> | | <pre> |
| fc1_weights = tf.Variable( # fully connected, depth 512. | | # Convolutional Layer #1 |
| tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512], | | # Computes 32 features using a 5x5 filter with ReLU activation. |
| stddev=0.1,
| | # Padding is added to preserve width and height. |
| seed=SEED,
| | # Input Tensor Shape: [batch_size, 28, 28, 1] |
| dtype=data_type()))
| | # Output Tensor Shape: [batch_size, 28, 28, 32] |
| </pre>
| | conv1 = tf.layers.conv2d( |
| | inputs=inputs, |
| | filters=32, |
| | kernel_size=[5, 5], |
| | padding='same', |
| | activation=tf.nn.relu, |
| | data_format=data_format) |
|
| |
|
| The second (last) fully connected layer is a 512-wide layer that has a depth equal to the number of labels - in this case, 10 corresponding to 10 digits.
| | # Pooling Layer #1 |
| | # First max pooling layer with a 2x2 filter and stride of 2 |
| | # Input Tensor Shape: [batch_size, 28, 28, 32] |
| | # Output Tensor Shape: [batch_size, 14, 14, 32] |
| | pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2, |
| | data_format=data_format) |
|
| |
|
| (Again, this looks like 512 inputs connecting to NUM_LABELS outputs.) | | # Convolutional Layer #2 |
| | # Computes 64 features using a 5x5 filter. |
| | # Padding is added to preserve width and height. |
| | # Input Tensor Shape: [batch_size, 14, 14, 32] |
| | # Output Tensor Shape: [batch_size, 14, 14, 64] |
| | conv2 = tf.layers.conv2d( |
| | inputs=pool1, |
| | filters=64, |
| | kernel_size=[5, 5], |
| | padding='same', |
| | activation=tf.nn.relu, |
| | data_format=data_format) |
|
| |
|
| <pre>
| | # Pooling Layer #2 |
| fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS], | | # Second max pooling layer with a 2x2 filter and stride of 2 |
| stddev=0.1,
| | # Input Tensor Shape: [batch_size, 14, 14, 64] |
| seed=SEED,
| | # Output Tensor Shape: [batch_size, 7, 7, 64] |
| dtype=data_type()))
| | pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2, |
| fc2_biases = tf.Variable(tf.constant( | | data_format=data_format) |
| 0.1, shape=[NUM_LABELS], dtype=data_type()))
| |
| </pre>
| |
|
| |
|
| ===Neural Network Layers=== | | # Flatten tensor into a batch of vectors |
| | # Input Tensor Shape: [batch_size, 7, 7, 64] |
| | # Output Tensor Shape: [batch_size, 7 * 7 * 64] |
| | pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64]) |
|
| |
|
| The next step in the script is to use the variables defined above to construct the neural network layers themselves.
| | # Dense Layer |
| | # Densely connected layer with 1024 neurons |
| | # Input Tensor Shape: [batch_size, 7 * 7 * 64] |
| | # Output Tensor Shape: [batch_size, 1024] |
| | dense = tf.layers.dense(inputs=pool2_flat, units=1024, |
| | activation=tf.nn.relu) |
|
| |
|
| Starting with function header:
| | # Add dropout operation; 0.6 probability that element will be kept |
| | dropout = tf.layers.dropout( |
| | inputs=dense, rate=0.4, training=(mode == tf.estimator.ModeKeys.TRAIN)) |
|
| |
|
| <pre>
| | # Logits layer |
| def model(data, train=False): | | # Input Tensor Shape: [batch_size, 1024] |
| """The Model definition."""
| | # Output Tensor Shape: [batch_size, 10] |
| | logits = tf.layers.dense(inputs=dropout, units=10) |
| | return logits |
| </pre> | | </pre> |
|
| |
|
| We first construct the first convolutional layer of the network:
| | ==Get Estimator== |
|
| |
|
| <pre> | | <pre> |
| # 2D convolution, with 'SAME' padding (i.e. the output feature map has
| | def mnist_model_fn(features, labels, mode): |
| # the same size as the input). Note that {strides} is a 4D array whose
| | """Model function for MNIST.""" |
| # shape matches the data layout: [image index, y, x, depth].
| | logits = mnist_model(features, mode) |
| conv = tf.nn.conv2d(data,
| |
| conv1_weights,
| |
| strides=[1, 1, 1, 1],
| |
| padding='SAME')
| |
| </pre>
| |
|
| |
|
| We pass in the convolution layer 1 weights variable created above.
| | predictions = { |
| | 'classes': tf.argmax(input=logits, axis=1), |
| | 'probabilities': tf.nn.softmax(logits, name='softmax_tensor') |
| | } |
|
| |
|
| After the convolution layer comes a rectifier layer:
| | if mode == tf.estimator.ModeKeys.PREDICT: |
| | return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) |
|
| |
|
| <pre>
| | loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits) |
| # Bias and rectified linear non-linearity.
| |
| relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases))
| |
| </pre>
| |
|
| |
|
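| then a max pooling layer, which downsamples each feature map by keeping only the maximum of every 2 x 2 window (the biases were already added in the previous step):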
| | # Configure the training op |
| | | if mode == tf.estimator.ModeKeys.TRAIN: |
| <pre>
| | optimizer = tf.train.AdamOptimizer(learning_rate=1e-4) |
| # Max pooling. The kernel size spec {ksize} also follows the layout of
| | train_op = optimizer.minimize(loss, tf.train.get_or_create_global_step()) |
| # the data. Here we have a pooling window of 2, and a stride of 2.
| | else: |
| pool = tf.nn.max_pool(relu,
| | train_op = None |
| ksize=[1, 2, 2, 1],
| |
| strides=[1, 2, 2, 1],
| |
| padding='SAME')
| |
| </pre>
| |
| | |
| Then comes the second convolutional layer:
| |
| | |
| <pre>
| |
| conv = tf.nn.conv2d(pool,
| |
| conv2_weights,
| |
| strides=[1, 1, 1, 1],
| |
| padding='SAME')
| |
| relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases))
| |
| pool = tf.nn.max_pool(relu,
| |
| ksize=[1, 2, 2, 1],
| |
| strides=[1, 2, 2, 1],
| |
| padding='SAME')
| |
| </pre>
| |
| | |
| Now the pool layer is a 4D tensor. We have to reshape it into a 2D matrix so that its shape matches the shape of the fully connected layer. Just take the 2nd, 3rd, and 4th dimensions and stretch them all out into a single dimension.
| |
| | |
| <pre>
| |
| # Reshape the feature map cuboid into a 2D matrix to feed it to the
| |
| # fully connected layers.
| |
| pool_shape = pool.get_shape().as_list()
| |
| reshape = tf.reshape(
| |
| pool,
| |
| [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]])
| |
| </pre>
| |
| | |
| The final step is the two fully connected layers: a hidden layer with a ReLU (plus 50% dropout during training only), followed by the output layer, which will output 10 signals to 10 neurons, each corresponding to our 10 classes/digits.
| |
| | |
| <pre>
| |
| # Fully connected layer. Note that the '+' operation automatically
| |
| # broadcasts the biases.
| |
| hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases)
| |
| # Add a 50% dropout during training only. Dropout also scales
| |
| # activations such that no rescaling is needed at evaluation time.
| |
| if train:
| |
| hidden = tf.nn.dropout(hidden, 0.5, seed=SEED)
| |
| | |
| return tf.matmul(hidden, fc2_weights) + fc2_biases
| |
| </pre>
| |
| | |
| ===Why Separate Variable Definitions and Model Creation===
| |
| | |
| The reason this network separates the variable creation, which initializes TensorFlow variables representing weights and biases, and the model creation, which is contained in a function that assembles a neural net layer by layer, is because the model will be used in two different contexts: training, and prediction.
| |
| | |
| The model construction always uses the same variables, no matter what the model is going to be used for. By separating variable creation from model construction, we allow the training and evaluation steps to use the same network architecture and the same weights.
| |
| | |
| ===Training Model===
| |
| | |
| The training procedure starts by creating a model (calling the model() function):
| |
| | |
| <pre>
| |
| # Training computation: logits + cross-entropy loss.
| |
| logits = model(train_data_node, True)
| |
| </pre>
| |
| | |
| The loss function is defined as a function that seeks to minimize the cross-entropy:
| |
| | |
| <pre>
| |
| loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
| |
| labels=train_labels_node, logits=logits))
| |
| </pre>
| |
| | |
| L2 regularizers are used to discourage overfitting (spurious fits): they add a penalty proportional to the squared size of the fully connected weights and biases, which keeps those parameters small rather than reducing their number:
| |
| | |
| <pre>
| |
| # L2 regularization for the fully connected parameters.
| |
| regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
| |
| tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))
| |
| # Add the regularization term to the loss.
| |
| loss += 5e-4 * regularizers
| |
| </pre>
| |
| | |
| The L2 loss function is part of the neural network (nn) module in TensorFlow.
| |
| | |
| Link to nn module documentation: https://www.tensorflow.org/versions/master/api_docs/python/tf/nn
| |
| | |
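| Last, the training schedule is set: an exponential decay schedule is used. Because staircase=True and the decay step is train_size, the learning rate starts at 0.01 and is multiplied by 0.95 once per epoch (not after every batch).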
| |
| | |
| <pre>
| |
| # Optimizer: set up a variable that's incremented once per batch and
| |
| # controls the learning rate decay.
| |
| batch = tf.Variable(0, dtype=data_type())
| |
| # Decay once per epoch, using an exponential schedule starting at 0.01.
| |
| learning_rate = tf.train.exponential_decay(
| |
| 0.01, # Base learning rate.
| |
| batch * BATCH_SIZE, # Current index into the dataset.
| |
| train_size, # Decay step.
| |
| 0.95, # Decay rate.
| |
| staircase=True)
| |
| </pre>
| |
| | |
| The optimizer object is set, and it works by minimizing the loss function (the mean of the cross entropy function plus the L2 regularization term):
| |
| | |
| <pre>
| |
| # Use simple momentum for the optimization.
| |
| optimizer = tf.train.MomentumOptimizer(learning_rate,
| |
| 0.9).minimize(loss,
| |
| global_step=batch)
| |
| </pre>
| |
| | |
| Lastly, there is the prediction function, which finishes the last layer of the neural network with a softmax function:
| |
| | |
| <pre>
| |
| # Predictions for the current training minibatch.
| |
| train_prediction = tf.nn.softmax(logits)
| |
| </pre>
| |
| | |
| The logits variable, i.e., the input to the softmax function, is the model function, fed the training data.
| |
| | |
| ===Evaluation Model===
| |
| | |
| We define the same type of softmax function for the output of the prediction network:
| |
| | |
| <pre>
| |
| # Predictions for the test and validation, which we'll compute less often.
| |
| eval_prediction = tf.nn.softmax(model(eval_data))
| |
| </pre>
| |
| | |
| The input to the softmax function is the model function fed the evaluation data.
| |
| | |
| ===Function to Evaluate Data===
| |
| | |
| To evaluate the data set, pass in a set of data, and split it into batches:
| |
| | |
| <pre>
| |
| # Small utility function to evaluate a dataset by feeding batches of data to
| |
| # {eval_data} and pulling the results from {eval_predictions}.
| |
| # Saves memory and enables this to run on smaller GPUs.
| |
| def eval_in_batches(data, sess): | |
| """Get all predictions for a dataset by running it in small batches."""
| |
| size = data.shape[0]
| |
| if size < EVAL_BATCH_SIZE:
| |
| raise ValueError("batch size for evals larger than dataset: %d" % size)
| |
| predictions = numpy.ndarray(shape=(size, NUM_LABELS), dtype=numpy.float32)
| |
| </pre>
| |
| | |
| These batches are then fed to the "evaluate prediction" function (eval_prediction):
| |
| | |
| <pre>
| |
| for begin in xrange(0, size, EVAL_BATCH_SIZE):
| |
| end = begin + EVAL_BATCH_SIZE
| |
| if end <= size:
| |
| predictions[begin:end, :] = sess.run(
| |
| eval_prediction,
| |
| feed_dict={eval_data: data[begin:end, ...]})
| |
| else:
| |
| batch_predictions = sess.run(
| |
| eval_prediction,
| |
| feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]})
| |
| predictions[begin:, :] = batch_predictions[begin - size:, :]
| |
| return predictions | |
| </pre>
| |
| | |
| ===Train the Model===
| |
| | |
| The last bit of code in this main method is the code that actually trains the model. We begin by starting a TensorFlow session:
| |
| | |
| <pre>
| |
| # Create a local session to run the training.
| |
| start_time = time.time()
| |
| with tf.Session() as sess:
| |
| </pre>
| |
|
| |
|
| The variables on the graph are initialized to their random values:
| | accuracy = tf.metrics.accuracy( |
| | tf.argmax(labels, axis=1), predictions['classes']) |
| | metrics = {'accuracy': accuracy} |
|
| |
|
| <pre>
| | # Create a tensor named train_accuracy for logging purposes |
| # Run all the initializers to prepare the trainable parameters.
| | tf.identity(accuracy[1], name='train_accuracy') |
| tf.global_variables_initializer().run()
| | tf.summary.scalar('train_accuracy', accuracy[1]) |
| print('Initialized!')
| |
| </pre>
| |
|
| |
|
| Now we have a loop over each training step - the number of steps is the number of epochs times the number of training examples, divided by the batch size (that is, the number of batches needed to pass over the training set num_epochs times).
| | return tf.estimator.EstimatorSpec( |
| | | mode=mode, |
| <pre>
| | predictions=predictions, |
| # Loop through training steps.
| | loss=loss, |
| for step in xrange(int(num_epochs * train_size) // BATCH_SIZE):
| | train_op=train_op, |
| # Compute the offset of the current minibatch in the data. | | eval_metric_ops=metrics) |
| # Note that we could use better randomization across epochs. | |
| offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE) | |
| batch_data = train_data[offset:(offset + BATCH_SIZE), ...] | |
| batch_labels = train_labels[offset:(offset + BATCH_SIZE)] | |
| </pre> | | </pre> |
|
| |
|
| (Note that there is a whole lot of manual shuffling of data going on here. While this is meant to be an illustrative example, it ends up being an unwieldy and confusing example that's quite difficult to extend to new cases.)
| | ==Main Function== |
|
| |
|
| <pre> | | <pre> |
| # This dictionary maps the batch data (as a numpy array) to the
| | def main(unused_argv): |
| # node in the graph it should be fed to.
| | # Create the Estimator |
| feed_dict = {train_data_node: batch_data,
| | mnist_classifier = tf.estimator.Estimator( |
| train_labels_node: batch_labels}
| | model_fn=mnist_model_fn, model_dir=FLAGS.model_dir) |
| # Run the optimizer to update weights.
| |
| sess.run(optimizer, feed_dict=feed_dict) | |
| </pre>
| |
|
| |
|
| The run() function is where the network is actually trained with a batch of data. We pass it an optimizer so that it will adjust the weights to minimize the loss function.
| | # Train the model |
| | tensors_to_log = { |
| | 'train_accuracy': 'train_accuracy' |
| | } |
|
| |
|
| Every few steps, we will evaluate the network by making predictions.
| | logging_hook = tf.train.LoggingTensorHook( |
| | tensors=tensors_to_log, every_n_iter=100) |
|
| |
|
| <pre>
| | batches_per_epoch = _NUM_IMAGES['train'] / FLAGS.batch_size |
| # print some extra information once reach the evaluation frequency
| |
| if step % EVAL_FREQUENCY == 0:
| |
| # fetch some extra nodes' data
| |
| l, lr, predictions = sess.run([loss, learning_rate, train_prediction],
| |
| feed_dict=feed_dict)
| |
| elapsed_time = time.time() - start_time
| |
| start_time = time.time()
| |
| print('Step %d (epoch %.2f), %.1f ms' %
| |
| (step, float(step) * BATCH_SIZE / train_size,
| |
| 1000 * elapsed_time / EVAL_FREQUENCY))
| |
| print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr))
| |
| print('Minibatch error: %.1f%%' % error_rate(predictions, batch_labels))
| |
| print('Validation error: %.1f%%' % error_rate(
| |
| eval_in_batches(validation_data, sess), validation_labels))
| |
| sys.stdout.flush()
| |
| </pre>
| |
|
| |
|
| Ugh, finally we get to the end, this example is like eating cardboard:
| | mnist_classifier.train( |
| | input_fn=lambda: input_fn(tf.estimator.ModeKeys.TRAIN, FLAGS.batch_size), |
| | steps=FLAGS.train_epochs * batches_per_epoch, |
| | hooks=[logging_hook]) |
|
| |
|
| <pre>
| | # Evaluate the model and print results |
| # Finally print the result!
| | eval_results = mnist_classifier.evaluate( |
| test_error = error_rate(eval_in_batches(test_data, sess), test_labels)
| | input_fn=lambda: input_fn(tf.estimator.ModeKeys.EVAL)) |
| print('Test error: %.1f%%' % test_error)
| | print() |
| | print('Evaluation results:\n %s' % eval_results) |
| </pre> | | </pre> |
|
| |
|
Simple MNIST Convolutional Network
Input Function
Define an input function. This has an internal function that parses the example data (one example at a time), rescales the image, and one-hot encodes each image's label (the digit the image corresponds to).
def input_fn(mode, batch_size=1):
    """Build the (images, labels) tensors that feed the Estimator.

    Reads 'train.tfrecords' (TRAIN mode) or 'test.tfrecords' (EVAL mode) from
    FLAGS.data_dir through the contrib.data input pipeline.

    Args:
      mode: tf.estimator.ModeKeys.TRAIN or tf.estimator.ModeKeys.EVAL.
      batch_size: number of examples per batch.

    Returns:
      A pair (images, labels) taken from a one-shot iterator over the batched
      dataset.
    """

    def example_parser(serialized_example):
        """Turn one serialized tf.Example into (image, one-hot label)."""
        parsed = tf.parse_single_example(
            serialized_example,
            features={
                'image_raw': tf.FixedLenFeature([], tf.string),
                'label': tf.FixedLenFeature([], tf.int64),
            })
        pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
        pixels.set_shape([28 * 28])
        # Rescale the pixel values from [0, 255] to [-0.5, 0.5].
        pixels = tf.cast(pixels, tf.float32) / 255 - 0.5
        digit = tf.cast(parsed['label'], tf.int32)
        return pixels, tf.one_hot(digit, 10)

    is_training = mode == tf.estimator.ModeKeys.TRAIN
    if not is_training:
        # Only TRAIN and EVAL are supported.
        assert mode == tf.estimator.ModeKeys.EVAL, 'invalid mode'
    record_name = 'train.tfrecords' if is_training else 'test.tfrecords'
    tfrecords_file = os.path.join(FLAGS.data_dir, record_name)

    assert tf.gfile.Exists(tfrecords_file), (
        'Run convert_to_records.py first to convert the MNIST data to TFRecord '
        'file format.')

    records = tf.contrib.data.TFRecordDataset([tfrecords_file])
    if is_training:
        # Training consumes the data indefinitely; the caller bounds it by steps.
        records = records.repeat()

    # Parse every record, then group the results into batches of up to
    # batch_size examples.
    batched = records.map(
        example_parser, num_threads=1, output_buffer_size=batch_size
    ).batch(batch_size)

    images, labels = batched.make_one_shot_iterator().get_next()
    return images, labels
Prepare Model
def mnist_model(inputs, mode):
  """Build the MNIST convolutional network and return its logits.

  The network is conv(32) -> pool -> conv(64) -> pool -> dense(1024) ->
  dropout -> dense(10).

  Args:
    inputs: Batch of MNIST images; reshaped here to [-1, 28, 28, 1].
    mode: A `tf.estimator.ModeKeys` value. Dropout is only active when it
      equals `tf.estimator.ModeKeys.TRAIN`.

  Returns:
    The output of the final 10-unit dense layer (one logit per digit class).
  """
  # Input layer: restore the images to 4-D channels_last (NHWC) form,
  # [batch_size, height, width, channels]. MNIST images are 28x28 pixels
  # with a single color channel.
  inputs = tf.reshape(inputs, [-1, 28, 28, 1])

  data_format = FLAGS.data_format
  if data_format is None:
    # No explicit choice was made: prefer channels_first (NCHW) on CUDA
    # builds for performance, channels_last (NHWC) otherwise.
    # See https://www.tensorflow.org/performance/performance_guide#data_formats
    if tf.test.is_built_with_cuda():
      data_format = 'channels_first'
    else:
      data_format = 'channels_last'

  if data_format == 'channels_first':
    # NHWC -> NCHW
    inputs = tf.transpose(inputs, [0, 3, 1, 2])

  # ---- Construct Model ----
  # Shape comments below are written for the channels_last layout.

  # Settings shared by both convolutional layers: 5x5 filters, ReLU
  # activation, and 'same' padding so width and height are preserved.
  conv_args = dict(kernel_size=[5, 5], padding='same',
                   activation=tf.nn.relu, data_format=data_format)
  # Settings shared by both pooling layers: 2x2 window with stride 2.
  pool_args = dict(pool_size=[2, 2], strides=2, data_format=data_format)

  # Convolutional layer #1: 32 features.
  #   [batch_size, 28, 28, 1] -> [batch_size, 28, 28, 32]
  conv1 = tf.layers.conv2d(inputs=inputs, filters=32, **conv_args)

  # Pooling layer #1.
  #   [batch_size, 28, 28, 32] -> [batch_size, 14, 14, 32]
  pool1 = tf.layers.max_pooling2d(inputs=conv1, **pool_args)

  # Convolutional layer #2: 64 features.
  #   [batch_size, 14, 14, 32] -> [batch_size, 14, 14, 64]
  conv2 = tf.layers.conv2d(inputs=pool1, filters=64, **conv_args)

  # Pooling layer #2.
  #   [batch_size, 14, 14, 64] -> [batch_size, 7, 7, 64]
  pool2 = tf.layers.max_pooling2d(inputs=conv2, **pool_args)

  # Flatten each example's feature maps into one vector.
  #   [batch_size, 7, 7, 64] -> [batch_size, 7 * 7 * 64]
  flattened = tf.reshape(pool2, [-1, 7 * 7 * 64])

  # Densely connected layer with 1024 neurons.
  #   [batch_size, 7 * 7 * 64] -> [batch_size, 1024]
  hidden = tf.layers.dense(inputs=flattened, units=1024,
                           activation=tf.nn.relu)

  # Dropout with rate 0.4 (each element is kept with probability 0.6),
  # applied only while training.
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  regularized = tf.layers.dropout(
      inputs=hidden, rate=0.4, training=is_training)

  # Logits layer.
  #   [batch_size, 1024] -> [batch_size, 10]
  return tf.layers.dense(inputs=regularized, units=10)
Get Estimator
def mnist_model_fn(features, labels, mode):
  """Estimator model function for MNIST.

  Args:
    features: Input images, forwarded unchanged to `mnist_model`.
    labels: One-hot labels; used for the cross-entropy loss and, via argmax,
      for the accuracy metric. Not read in PREDICT mode.
    mode: A `tf.estimator.ModeKeys` value selecting which spec to build.

  Returns:
    A `tf.estimator.EstimatorSpec`. In PREDICT mode it carries only the
    predictions; otherwise it also carries the loss, the train op (None
    unless mode is TRAIN) and the accuracy metric.
  """
  logits = mnist_model(features, mode)

  predicted_classes = tf.argmax(input=logits, axis=1)
  class_probabilities = tf.nn.softmax(logits, name='softmax_tensor')
  predictions = {
      'classes': predicted_classes,
      'probabilities': class_probabilities
  }

  # Prediction needs neither labels nor a loss, so return early.
  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)

  # Only TRAIN mode gets an optimizer step; EVAL leaves train_op as None.
  train_op = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    global_step = tf.train.get_or_create_global_step()
    train_op = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(
        loss, global_step)

  true_classes = tf.argmax(labels, axis=1)
  accuracy_metric = tf.metrics.accuracy(true_classes, predicted_classes)

  # Expose the second element of the metric pair under the name
  # 'train_accuracy' so it can be referenced by name for logging, and record
  # it as a scalar summary.
  accuracy_update = accuracy_metric[1]
  tf.identity(accuracy_update, name='train_accuracy')
  tf.summary.scalar('train_accuracy', accuracy_update)

  return tf.estimator.EstimatorSpec(
      mode=mode,
      predictions=predictions,
      loss=loss,
      train_op=train_op,
      eval_metric_ops={'accuracy': accuracy_metric})
Main Function
def main(unused_argv):
  """Train the MNIST Estimator, then evaluate it and print the results.

  Reads `FLAGS.model_dir`, `FLAGS.batch_size` and `FLAGS.train_epochs`, and
  the training-set size from `_NUM_IMAGES['train']`.

  Args:
    unused_argv: Ignored.
  """
  # Create the Estimator
  mnist_classifier = tf.estimator.Estimator(
      model_fn=mnist_model_fn, model_dir=FLAGS.model_dir)

  # Log the tensor named 'train_accuracy' every 100 iterations.
  tensors_to_log = {
      'train_accuracy': 'train_accuracy'
  }
  logging_hook = tf.train.LoggingTensorHook(
      tensors=tensors_to_log, every_n_iter=100)

  # `steps` is a count of batches, so compute it with integer arithmetic: a
  # bare `/` yields a float under true division, and a fractional value when
  # batch_size does not divide the dataset size. Ceiling division
  # (-(-a // b)) rounds up so the final partial batch is still trained on.
  total_examples = FLAGS.train_epochs * _NUM_IMAGES['train']
  train_steps = int(-(-total_examples // FLAGS.batch_size))

  # Train the model
  mnist_classifier.train(
      input_fn=lambda: input_fn(tf.estimator.ModeKeys.TRAIN, FLAGS.batch_size),
      steps=train_steps,
      hooks=[logging_hook])

  # Evaluate the model and print results
  eval_results = mnist_classifier.evaluate(
      input_fn=lambda: input_fn(tf.estimator.ModeKeys.EVAL))
  print()
  print('Evaluation results:\n    %s' % eval_results)
Flags