Tensorflow 开启训练后卡死
毕设做深度学习的课题,使用到了TensorFlow,但训练时出现了问题:
跑脚本开启训练之后,跑完不到100次就会卡死,然后显示python已停止工作
这是我的训练代码:
# Import the dataset helper module; the training loop calls load_record.inputs().
# (In the original paste this import was fused into the comment line, so the
# module was never actually imported.)
import load_record
# Import TensorFlow and create the session.
import tensorflow as tf

# InteractiveSession installs itself as the default session, which is what
# lets Tensor.eval() / Operation.run() be called later without passing a session.
sess = tf.InteractiveSession()

# Build the computation graph.
# x: batch of single-channel 224x224 images; y_: batch of 2-element label rows.
x = tf.placeholder("float", shape=[None, 224, 224, 1])
y_ = tf.placeholder("float", shape=[None, 2])
def weight_variable(shape):
    """Return a new tf.Variable of the given shape for use as a weight.

    The initial value is sampled from a truncated normal distribution with
    a standard deviation of 0.1.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
def bias_variable(shape):
    """Return a new tf.Variable of the given shape for use as a bias.

    Every element starts at the constant 0.1.
    """
    return tf.Variable(tf.constant(0.1, shape=shape))
def conv2d(x, W):
    """Convolve `x` with filter `W` using stride 1 and 'SAME' padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')
def max_pool_2x2(x):
    """Apply 2x2 max pooling with stride 2 and 'SAME' padding to `x`."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')
# Reshape the input to [batch, 224, 224, 1]; this is the same shape already
# declared on the placeholder `x`.
x_image = tf.reshape(x, [-1, 224, 224, 1])
# First convolutional layer: 5x5 kernels, 1 input channel -> 32 feature maps,
# then ReLU and 2x2 max pooling (224x224 -> 112x112).
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)
# Second convolutional layer: 5x5 kernels, 32 -> 64 feature maps,
# then ReLU and 2x2 max pooling (112x112 -> 56x56).
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2= max_pool_2x2(h_conv2)
# First fully connected layer: flatten the 56x56x64 feature maps and map
# them to 1024 units. This weight matrix has 56*56*64*1024 entries, so it
# dominates the model's memory footprint.
W_fc1 = weight_variable([56 * 56 * 64, 1024])
b_fc1 = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool2, [-1, 56*56*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
# Dropout layer to reduce overfitting; keep_prob is supplied through
# feed_dict (0.5 for training steps, 1.0 when evaluating accuracy).
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
# Second fully connected layer: the output layer, producing 2 logits per example.
W_fc2 = weight_variable([1024, 2])
b_fc2 = bias_variable([2])
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
# Train the model.
# Loss: softmax cross-entropy between the labels y_ and the logits y_conv,
# averaged with reduce_mean.
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
# One optimization step with Adam at learning rate 1e-4.
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
# A prediction counts as correct when the argmax of the logits equals the
# argmax of the label row; accuracy is the mean of those matches.
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Initialize all global variables before any training step runs.
sess.run(tf.global_variables_initializer())
print('ready...')
# Build the input pipeline ONCE, before the loop. Calling load_record.inputs()
# and tf.train.start_queue_runners() on every iteration (as the original did)
# adds a fresh set of reader/shuffle ops to the graph and starts another set
# of queue-runner threads each step, so memory and thread usage grow without
# bound until the process dies.
images, labels = load_record.inputs(data_set='train', batch_size=50, num_epochs=None)

# The coordinator lets the queue-runner threads be stopped and joined cleanly.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
try:
    for i in range(500):
        # Dequeue one batch as numpy arrays, then feed it through the placeholders.
        _images, _labels = sess.run([images, labels])
        if i % 100 == 0:
            # Evaluate with dropout disabled (keep_prob = 1.0).
            train_accuracy = accuracy.eval(feed_dict={
                x: _images, y_: _labels, keep_prob: 1.0})
            print("step %d, training accuracy %g" % (i, train_accuracy))
        train_step.run(feed_dict={x: _images, y_: _labels, keep_prob: 0.5})
finally:
    # Always shut the input threads down, even if a step raised.
    coord.request_stop()
    coord.join(threads)
数据集我是用tfrecord读进去的
def inputs(data_set, batch_size, num_epochs):
    """Build a shuffled-batch input pipeline over a TFRecord file.

    Each call adds new queue ops to the current graph, so call this once
    while building the graph, not once per training step.

    Args:
        data_set: 'train' selects TRAIN_FILE; any other value selects
            VALIDATION_FILE.
        batch_size: number of examples per dequeued batch.
        num_epochs: how many times to cycle through the file; any falsy
            value (0, None) is treated as None and passed on unchanged to
            string_input_producer.

    Returns:
        A tuple (images, labels) of batched tensors produced by
        tf.train.shuffle_batch. Queue runners must be started (see
        tf.train.start_queue_runners) before these can be evaluated.
    """
    if not num_epochs:
        num_epochs = None
    record_path = TRAIN_FILE if data_set == 'train' else VALIDATION_FILE
    with tf.name_scope('input'):
        filename_queue = tf.train.string_input_producer(
            [record_path], num_epochs=num_epochs)
        image, label = read_and_decode(filename_queue)
        # Draw random batches of batch_size images and labels.
        # NOTE(review): 64 enqueue threads is a lot for a single input file;
        # kept as in the original, but consider lowering it.
        images, labels = tf.train.shuffle_batch(
            [image, label],
            batch_size=batch_size,
            num_threads=64,
            capacity=1000 + 3 * batch_size,
            min_after_dequeue=1000,
        )
    return images, labels
求教,如需补充描述请@
只迭代了第一次, 第二次就不行了
回答:
看上去是TensorFlow要求CPU具备AVX和AVX2特性,但你的CPU不支持.
AVX指令集是Sandy Bridge和Larrabee架构下的新指令集.
Sandy Bridge是英特尔2011年后出的CPU.
如果可以的话找台新机器试试你的代码.
Linux上用下面的代码可以检查是否CPU支持avx技术
cat /proc/cpuinfo |grep avx
回答:
请问楼主解决了吗?谢谢
回答:
是tf.train.string_input_producer的原因
https://blog.csdn.net/lujiand...
回答:
请问解决了吗?怎么解决的?
以上是 Tensorflow 开启训练后卡死 的全部内容, 来源链接: utcz.com/a/160485.html