Tensorflow 开启训练后卡死

毕设做深度学习的课题,使用到了TensorFlow,但训练时出现了问题:

跑脚本开启训练之后,跑完不到100次就会卡死,然后显示python已停止工作

图片描述

这是我的训练的代码

# 导入数据集

import load_record

# 导入TensorFlow并创建Session

import tensorflow as tf

sess = tf.InteractiveSession()

# # 构建计算图

x = tf.placeholder("float", shape = [None, 224, 224, 1])

y_ = tf.placeholder("float", shape = [None, 2])

def weight_variable(shape):

initial = tf.truncated_normal(shape, stddev = 0.1)

return tf.Variable(initial)

def bias_variable(shape):

initial = tf.constant(0.1, shape = shape)

return tf.Variable(initial)

def conv2d(x, W):

return tf.nn.conv2d(x, W, strides = [1, 1, 1, 1], padding = 'SAME')

def max_pool_2x2(x):

return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = 'SAME')

x_image = tf.reshape(x, [-1, 224, 224, 1])

# 第一层卷积

W_conv1 = weight_variable([5, 5, 1, 32])

b_conv1 = bias_variable([32])

h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)

h_pool1 = max_pool_2x2(h_conv1)

# 第二层卷积

W_conv2 = weight_variable([5, 5, 32, 64])

b_conv2 = bias_variable([64])

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)

h_pool2= max_pool_2x2(h_conv2)

# 第一层全连接层

W_fc1 = weight_variable([56 * 56 * 64, 1024])

b_fc1 = bias_variable([1024])

h_pool2_flat = tf.reshape(h_pool2, [-1, 56*56*64])

h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

# Dropout层防止过拟合

keep_prob = tf.placeholder(tf.float32)

h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# 第二层全连接层输出

W_fc2 = weight_variable([1024, 2])

b_fc2 = bias_variable([2])

y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

# 训练模型

cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))

train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))

accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

sess.run(tf.global_variables_initializer())

print('ready...')

for i in range(500):

images, labels = load_record.inputs(data_set = 'train', batch_size = 50, num_epochs = None)

tf.train.start_queue_runners(sess = sess)

_images, _labels = sess.run([images, labels])

if i % 100 == 0:

train_accuracy = accuracy.eval(feed_dict={

x:_images, y_: _labels, keep_prob: 1.0})

print("step %d, training accuracy %g"%(i, train_accuracy))

train_step.run(feed_dict={x: _images, y_: _labels, keep_prob: 0.5})

数据集我是用tfrecord读进去的

def inputs(data_set,batch_size,num_epochs):

if not num_epochs:

num_epochs = None

if data_set == 'train':

file = TRAIN_FILE

else:

file = VALIDATION_FILE

with tf.name_scope('input') as scope:

filename_queue = tf.train.string_input_producer([file], num_epochs=num_epochs)

image,label = read_and_decode(filename_queue)

#随机获得batch_size大小的图像和label

images,labels = tf.train.shuffle_batch([image, label],

batch_size=batch_size,

num_threads=64,

capacity=1000 + 3 * batch_size,

min_after_dequeue=1000

)

return images,labels

求教,如需补充描述请@


只迭代了第一次, 第二次就不行了

回答:

看上去是TenserFlow要求CPU具备AVX和AVX2特性,但你的CPU不支持.

AVX指令集是Sandy Bridge和Larrabee架构下的新指令集.
Sandy Bridge是英特2011年后出的CPU.

如果可以的话找台新机器试试你的代码.

Linux上用下面的代码可以检查是否CPU支持avx技术

 cat /proc/cpuinfo |grep avx

回答:

请问楼主解决了吗?谢谢

回答:

是tf.train.string_input_producer的原因
https://blog.csdn.net/lujiand...

回答:

请问解决了吗?怎么解决的?

以上是 Tensorflow 开启训练后卡死 的全部内容, 来源链接: utcz.com/a/160485.html

回到顶部