tensorflow 分布式 数据并行 异步训练 between-graph 自己写的实例 CNN

网友投稿 608 2022-11-15

tensorflow 分布式 数据并行 异步训练 between-graph 自己写的实例 CNN

tensorflow 分布式 数据并行 异步训练 between-graph 自己写的实例 CNN

# Between-graph replication, asynchronous data-parallel training of a small
# CNN on MNIST.
#
# For general data parallelism between-graph replication is still the
# recommended approach, because with in-graph replication you have to merge
# the cost etc. yourself.
#
# Run on machine 10.100.203.75:
#   python test_dis2.py --job_name=worker --ps_hosts=10.100.203.75:1111 --worker_hosts=10.100.206.209:2222,10.100.203.75:2223 --task_id=1
#   python test_dis2.py --job_name=ps --ps_hosts=10.100.203.75:1111 --worker_hosts=10.100.206.209:2222,10.100.203.75:2223 --task_id=0
# Run on machine 10.100.206.209:
#   python test_dis2.py --job_name=worker --ps_hosts=10.100.203.75:1111 --worker_hosts=10.100.206.209:2222,10.100.203.75:2223 --task_id=0
#
# Note: task_id must match the position, within --worker_hosts, of the machine
# on which the --job_name=worker script is started.
# It seems best to start the ps first and then the two workers; in the end
# both workers print their training progress.
import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('job_name', '', 'One of "ps", "worker"')
tf.app.flags.DEFINE_string('ps_hosts', '',
                           """Comma-separated list of hostname:port for the """
                           """parameter server jobs. e.g. """
                           """'machine1:2222,machine2:1111,machine2:2222'""")
tf.app.flags.DEFINE_string('worker_hosts', '',
                           """Comma-separated list of hostname:port for the """
                           """worker jobs. e.g. """
                           """'machine1:2222,machine2:1111,machine2:2222'""")
tf.app.flags.DEFINE_integer(
    'task_id', 0, 'Task id of the replica running the training.')

# Describe the cluster once and reuse the same spec for the server and for
# device placement below.
ps_hosts = FLAGS.ps_hosts.split(',')
worker_hosts = FLAGS.worker_hosts.split(',')
cluster_spec = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
server = tf.train.Server(
    cluster_spec,
    job_name=FLAGS.job_name,
    task_index=FLAGS.task_id)

print("!!!!")
if FLAGS.job_name == 'ps':
    # A parameter server only hosts variables; it blocks here and never
    # reaches the model-building code below.
    server.join()
print("!!!!")

# Imported after the ps branch so that only workers load the dataset.
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("./", one_hot=True)  # the four MNIST .gz files

# Hyperparameters.
learning_rate = 0.001
training_iters = 200000
batch_size = 128
display_step = 10

# Network parameters.
n_input = 784
n_classes = 10
dropout = 0.75  # value fed to keep_prob during training


def conv2d(x, W, b, strides=1):
    """Apply a SAME-padded 2-D convolution, add the bias and apply ReLU."""
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
    x = tf.nn.bias_add(x, b)
    return tf.nn.relu(x)


def maxpool2d(x, k=2):
    """Apply SAME-padded max pooling with a k x k window and stride k."""
    return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                          padding='SAME')


def conv_net(x, weights, biases, dropout):
    """Build the CNN and return the (un-normalized) class logits.

    Args:
        x: input batch, reshaped here to [-1, 28, 28, 1].
        weights: dict with keys 'wc1', 'wc2', 'wd1', 'out'.
        biases: dict with keys 'bc1', 'bc2', 'bd1', 'out'.
        dropout: keep probability passed to tf.nn.dropout.
    """
    x = tf.reshape(x, shape=[-1, 28, 28, 1])

    conv1 = conv2d(x, weights['wc1'], biases['bc1'])
    # Max pooling (down-sampling).
    conv1 = maxpool2d(conv1, k=2)

    conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
    conv2 = maxpool2d(conv2, k=2)

    # Fully connected layer.
    # Reshape conv2 output to fit the fully connected layer input.
    fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
    fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
    fc1 = tf.nn.relu(fc1)
    fc1 = tf.nn.dropout(fc1, dropout)

    out = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
    return out


with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % FLAGS.task_id,
        cluster=cluster_spec)):
    x = tf.placeholder(tf.float32, [None, n_input])
    y = tf.placeholder(tf.float32, [None, n_classes])
    keep_prob = tf.placeholder(tf.float32)  # dropout (keep probability)

    weights = {
        # 5x5 conv, 1 input, 32 outputs
        'wc1': tf.Variable(tf.random_normal([5, 5, 1, 32])),
        # 5x5 conv, 32 inputs, 64 outputs
        'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64])),
        # fully connected, 7*7*64 inputs, 1024 outputs
        'wd1': tf.Variable(tf.random_normal([7*7*64, 1024])),
        # 1024 inputs, 10 outputs (class prediction)
        'out': tf.Variable(tf.random_normal([1024, n_classes]))
    }
    biases = {
        'bc1': tf.Variable(tf.random_normal([32])),
        'bc2': tf.Variable(tf.random_normal([64])),
        'bd1': tf.Variable(tf.random_normal([1024])),
        'out': tf.Variable(tf.random_normal([n_classes]))
    }

    pred = conv_net(x, weights, biases, keep_prob)
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))

    # Create global_step before the optimizer and hand it to minimize() so
    # that every applied update increments it; otherwise the step passed to
    # the Supervisor below would stay at 0 forever.
    global_step = tf.Variable(0, name='global_step', trainable=False)
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(cost, global_step=global_step)

    correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    # tf.summary.* replaces tf.scalar_summary / tf.merge_all_summaries, which
    # were removed in TensorFlow 1.0.
    tf.summary.scalar('cost', cost)
    summary_op = tf.summary.merge_all()

# Raw string: in a normal literal "\U" starts a unicode escape, which is a
# SyntaxError on Python 3.
sv = tf.train.Supervisor(is_chief=(FLAGS.task_id == 0),
                         logdir=r"C:\Users\guotong1\Desktop\checkpoint",
                         init_op=init,
                         summary_op=None,
                         saver=saver,
                         global_step=global_step,
                         save_model_secs=60)

# Launch the graph.
# Effectively both workers start a session; server.target is the localhost of
# that machine.
with sv.managed_session(server.target) as sess:
    # Do NOT run `init` here: the Supervisor already runs init_op on the
    # chief, and re-running it from each worker would reset the shared
    # variables held by the parameter server.
    step = 1
    while step * batch_size < training_iters:
        batch_x, batch_y = mnist.train.next_batch(batch_size)
        sess.run(optimizer,
                 feed_dict={x: batch_x, y: batch_y, keep_prob: dropout})
        if step % display_step == 0:
            # Evaluate on the current minibatch with dropout disabled.
            loss, acc = sess.run([cost, accuracy],
                                 feed_dict={x: batch_x, y: batch_y,
                                            keep_prob: 1.})
            print("Iter " + str(step*batch_size) + ", Minibatch Loss= " +
                  "{:.6f}".format(loss) + ", Training Accuracy= " +
                  "{:.5f}".format(acc))
        step += 1
    print("Optimization Finished!")

    # Calculate accuracy for 256 mnist test images.
    print("Testing Accuracy:",
          sess.run(accuracy, feed_dict={x: mnist.test.images[:256],
                                        y: mnist.test.labels[:256],
                                        keep_prob: 1.}))
sv.stop()

版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。

上一篇:tensor transpose之后的含义探究
下一篇:5个步骤让你明白多线程和线程安全
相关文章

 发表评论

暂时没有评论,来抢沙发吧~