alexnet_benchmark_fp16.py
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Timing benchmark for AlexNet inference. | |
To run, use: | |
bazel run -c opt --config=cuda \ | |
models/tutorials/image/alexnet:alexnet_benchmark | |
Across 100 steps on batch size = 128. | |
Forward pass: | |
Run on Tesla K40c: 145 +/- 1.5 ms / batch | |
Run on Titan X: 70 +/- 0.1 ms / batch | |
Forward-backward pass: | |
Run on Tesla K40c: 480 +/- 48 ms / batch | |
Run on Titan X: 244 +/- 30 ms / batch | |
""" | |
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
from datetime import datetime
import math
import sys
import time

from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

slim = tf.contrib.slim
from tensorflow.contrib.slim.nets import resnet_v1
from tensorflow.contrib.slim.nets import resnet_utils

FLAGS = None

def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
                                    initializer=None, regularizer=None,
                                    trainable=True, *args, **kwargs):
  """Custom variable getter that forces trainable variables to be stored in
  float32 precision and then casts them to the training precision.
  """
  storage_dtype = tf.float32 if trainable else dtype
  variable = getter(name, shape, dtype=storage_dtype,
                    initializer=initializer, regularizer=regularizer,
                    trainable=trainable, *args, **kwargs)
  if trainable and dtype != tf.float32:
    variable = tf.cast(variable, dtype)
  return variable
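
# Usage sketch (illustrative; the variable name and shape below are assumptions,
# not part of this benchmark): attaching the getter to a variable scope keeps
# float32 master copies for variables created with tf.get_variable while the
# graph computes in float16, e.g.:
#
#   with tf.variable_scope('fp32_storage',
#                          custom_getter=float32_variable_storage_getter):
#     w = tf.get_variable('w', shape=[3, 3, 64, 64], dtype=tf.float16)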

def gradients_with_loss_scaling(loss, variables, loss_scale):
  """Gradient calculation with loss scaling to improve numerical stability
  when training with float16.
  """
  return [grad / loss_scale
          for grad in tf.gradients(loss * loss_scale, variables)]
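
# Usage sketch (illustrative; the optimizer, learning rate, and loss scale are
# assumptions, not part of this benchmark): the loss is scaled up before the
# float16 backward pass and the resulting gradients are scaled back down by the
# helper, e.g.:
#
#   loss_scale = 128.0
#   variables = tf.trainable_variables()
#   grads = gradients_with_loss_scaling(loss, variables, loss_scale)
#   optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
#   train_op = optimizer.apply_gradients(zip(grads, variables))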

def print_activations(t):
  print(t.op.name, ' ', t.get_shape().as_list())

def inference(images, dtype):
  """Build the AlexNet model.

  Args:
    images: Images Tensor.
    dtype: data type used for the model's variables and activations.

  Returns:
    pool5: the last Tensor in the convolutional component of AlexNet.
    parameters: a list of Tensors corresponding to the weights and biases of
        the AlexNet model.
  """
  parameters = []
  # conv1
  with tf.name_scope('conv1') as scope:
    kernel = tf.Variable(tf.truncated_normal([11, 11, 3, 64], dtype=dtype,
                                              stddev=1e-1), name='weights')
    conv = tf.nn.conv2d(images, kernel, [1, 4, 4, 1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[64], dtype=dtype),
                         trainable=True, name='biases')
    bias = tf.nn.bias_add(conv, biases)
    conv1 = tf.nn.relu(bias, name=scope)
    print_activations(conv1)
    parameters += [kernel, biases]

  # lrn1
  # with tf.name_scope('lrn1') as scope:
  #   lrn1 = tf.nn.local_response_normalization(conv1,
  #                                             alpha=1e-4,
  #                                             beta=0.75,
  #                                             depth_radius=2,
  #                                             bias=2.0)

  # pool1
  pool1 = tf.nn.max_pool(conv1,
                         ksize=[1, 3, 3, 1],
                         strides=[1, 2, 2, 1],
                         padding='VALID',
                         name='pool1')
  print_activations(pool1)
  # conv2
  with tf.name_scope('conv2') as scope:
    kernel = tf.Variable(tf.truncated_normal([5, 5, 64, 192], dtype=dtype,
                                              stddev=1e-1), name='weights')
    conv = tf.nn.conv2d(pool1, kernel, [1, 1, 1, 1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[192], dtype=dtype),
                         trainable=True, name='biases')
    bias = tf.nn.bias_add(conv, biases)
    conv2 = tf.nn.relu(bias, name=scope)
    parameters += [kernel, biases]
    print_activations(conv2)

  # lrn2
  # with tf.name_scope('lrn2') as scope:
  #   lrn2 = tf.nn.local_response_normalization(conv2,
  #                                             alpha=1e-4,
  #                                             beta=0.75,
  #                                             depth_radius=2,
  #                                             bias=2.0)

  # pool2
  pool2 = tf.nn.max_pool(conv2,
                         ksize=[1, 3, 3, 1],
                         strides=[1, 2, 2, 1],
                         padding='VALID',
                         name='pool2')
  print_activations(pool2)
  # conv3
  with tf.name_scope('conv3') as scope:
    kernel = tf.Variable(tf.truncated_normal([3, 3, 192, 384],
                                              dtype=dtype,
                                              stddev=1e-1), name='weights')
    conv = tf.nn.conv2d(pool2, kernel, [1, 1, 1, 1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[384], dtype=dtype),
                         trainable=True, name='biases')
    bias = tf.nn.bias_add(conv, biases)
    conv3 = tf.nn.relu(bias, name=scope)
    parameters += [kernel, biases]
    print_activations(conv3)
  # conv4
  with tf.name_scope('conv4') as scope:
    kernel = tf.Variable(tf.truncated_normal([3, 3, 384, 256],
                                              dtype=dtype,
                                              stddev=1e-1), name='weights')
    conv = tf.nn.conv2d(conv3, kernel, [1, 1, 1, 1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[256], dtype=dtype),
                         trainable=True, name='biases')
    bias = tf.nn.bias_add(conv, biases)
    conv4 = tf.nn.relu(bias, name=scope)
    parameters += [kernel, biases]
    print_activations(conv4)
  # conv5
  with tf.name_scope('conv5') as scope:
    kernel = tf.Variable(tf.truncated_normal([3, 3, 256, 256],
                                              dtype=dtype,
                                              stddev=1e-1), name='weights')
    conv = tf.nn.conv2d(conv4, kernel, [1, 1, 1, 1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[256], dtype=dtype),
                         trainable=True, name='biases')
    bias = tf.nn.bias_add(conv, biases)
    conv5 = tf.nn.relu(bias, name=scope)
    parameters += [kernel, biases]
    print_activations(conv5)

  # pool5
  pool5 = tf.nn.max_pool(conv5,
                         ksize=[1, 3, 3, 1],
                         strides=[1, 2, 2, 1],
                         padding='VALID',
                         name='pool5')
  print_activations(pool5)

  return pool5, parameters
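
# For reference (derived from the strides and padding above, assuming the
# default 224x224 input used in run_benchmark), print_activations should report
# shapes along the lines of:
#   conv1 [batch, 56, 56, 64]   pool1 [batch, 27, 27, 64]
#   conv2 [batch, 27, 27, 192]  pool2 [batch, 13, 13, 192]
#   conv3 [batch, 13, 13, 384]  conv4 [batch, 13, 13, 256]
#   conv5 [batch, 13, 13, 256]  pool5 [batch, 6, 6, 256]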

def time_tensorflow_run(session, target, info_string):
  """Run the computation to obtain the target tensor and print timing stats.

  Args:
    session: the TensorFlow session to run the computation under.
    target: the target Tensor that is passed to the session's run() function.
    info_string: a string summarizing this run, to be printed with the stats.

  Returns:
    None
  """
  num_steps_burn_in = 10
  total_duration = 0.0
  total_duration_squared = 0.0
  for i in xrange(FLAGS.num_batches + num_steps_burn_in):
    start_time = time.time()
    _ = session.run(target)
    duration = time.time() - start_time
    if i >= num_steps_burn_in:
      if not i % 10:
        print('%s: step %d, duration = %.3f' %
              (datetime.now(), i - num_steps_burn_in, duration))
      total_duration += duration
      total_duration_squared += duration * duration
  mn = total_duration / FLAGS.num_batches
  vr = total_duration_squared / FLAGS.num_batches - mn * mn
  sd = math.sqrt(vr)
  print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
        (datetime.now(), info_string, FLAGS.num_batches, mn, sd))

def run_benchmark():
  """Run the benchmark on AlexNet."""
  dtype = tf.float16
  with tf.device('/gpu:0'), tf.variable_scope(
      'fp32_storage', custom_getter=float32_variable_storage_getter):
    # Generate some dummy images.
    image_size = 224
    # Note that our padding definition is slightly different from cuda-convnet.
    # In order to force the model to start with the same activation sizes,
    # we add 3 to the image_size and employ VALID padding above.
    images = tf.Variable(tf.random_normal([FLAGS.batch_size,
                                           image_size,
                                           image_size, 3],
                                          dtype=dtype,
                                          stddev=1e-1))

    # Build a Graph that computes the logits predictions from the
    # inference model.
    pool5, parameters = inference(images, dtype)

    # Build an initialization operation.
    init = tf.global_variables_initializer()

    # Start running operations on the Graph.
    config = tf.ConfigProto()
    config.gpu_options.allocator_type = 'BFC'
    sess = tf.Session(config=config)
    sess.run(init)

    # Run the forward benchmark.
    time_tensorflow_run(sess, pool5, "Forward")

    # Add a simple objective so we can calculate the backward pass.
    objective = tf.nn.l2_loss(pool5)
    # Compute the gradient with respect to all the parameters.
    grad = tf.gradients(objective, parameters)
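    # Illustrative alternative (not part of this benchmark): the
    # gradients_with_loss_scaling helper defined above could stand in for the
    # plain tf.gradients call to guard small float16 gradients against
    # underflow; the loss scale of 128 here is an assumed value, e.g.:
    #   grad = gradients_with_loss_scaling(objective, parameters, 128.0)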
    # Run the backward benchmark.
    time_tensorflow_run(sess, grad, "Forward-backward")

def main(_):
  run_benchmark()


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--batch_size',
      type=int,
      default=128,
      help='Batch size.'
  )
  parser.add_argument(
      '--num_batches',
      type=int,
      default=100,
      help='Number of batches to run.'
  )
  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)