1. On the master node, first update the slaves file so the cluster can be started with the new nodes (slave3 and slave4):
[root@master hadoop]# ls
bin etc include lib libexec LICENSE.txt logs NOTICE.txt README.txt sbin share tmp
[root@master hadoop]# cd etc
[root@master etc]# ls
hadoop
[root@master etc]# cd hadoop
[root@master hadoop]# ls
capacity-scheduler.xml hadoop-policy.xml kms-log4j.properties slaves
configuration.xsl hdfs-site.xml kms-site.xml ssl-client.xml.example
container-executor.cfg httpfs-env.sh log4j.properties ssl-server.xml.example
core-site.xml httpfs-log4j.properties mapred-env.cmd yarn-env.cmd
hadoop-env.cmd httpfs-signature.secret mapred-env.sh yarn-env.sh
hadoop-env.sh httpfs-site.xml mapred-queues.xml.template yarn-site.xml
hadoop-metrics2.properties kms-acls.xml mapred-site.xml
hadoop-metrics.properties kms-env.sh mapred-site.xml.template
[root@master hadoop]# vi slaves
slave1
slave2
slave3
slave4
2. Modify the slaves file on slave1 the same way, then copy it to the other nodes:
[root@slave1 hadoop]# scp -r slaves slave2:/usr/hadoop/etc/hadoop
slaves
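Since every node needs the identical slaves file, a small loop can push it to the remaining nodes in one pass (a sketch; the hostnames are the ones listed in the slaves file above):
[root@slave1 hadoop]# for h in slave2 slave3 slave4; do scp slaves $h:/usr/hadoop/etc/hadoop/; done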
3. On the master node, delete the old log files:
[root@master hadoop]# ls
bin etc include lib libexec LICENSE.txt logs NOTICE.txt README.txt sbin share tmp
[root@master hadoop]# cd logs
[root@master logs]# rm *.log
rm: remove regular file ‘hadoop-root-balancer-master.log’?
rm: remove regular file ‘hadoop-root-namenode-master.log’?
rm: remove regular file ‘hadoop-root-secondarynamenode-master.log’?
rm: remove regular file ‘yarn-bigdata-resourcemanager-master.log’?
rm: remove regular file ‘yarn-root-resourcemanager-master.log’?
[root@master logs]#
[root@master logs]# rm *.out.*
rm: remove regular file ‘hadoop-root-balancer-master.out.1’?
rm: remove regular file ‘hadoop-root-balancer-master.out.2’?
rm: remove regular file ‘hadoop-root-namenode-master.out.1’?
rm: remove regular file ‘hadoop-root-namenode-master.out.2’?
rm: remove regular file ‘hadoop-root-namenode-master.out.3’?
rm: remove regular file ‘hadoop-root-namenode-master.out.4’?
rm: remove regular file ‘hadoop-root-namenode-master.out.5’?
rm: remove regular file ‘hadoop-root-secondarynamenode-master.out.1’?
rm: remove regular file ‘hadoop-root-secondarynamenode-master.out.2’?
rm: remove regular file ‘hadoop-root-secondarynamenode-master.out.3’?
rm: remove regular file ‘hadoop-root-secondarynamenode-master.out.4’?
rm: remove regular file ‘hadoop-root-secondarynamenode-master.out.5’?
rm: remove regular file ‘yarn-bigdata-resourcemanager-master.out.1’?
rm: remove regular file ‘yarn-root-resourcemanager-master.out.1’?
rm: remove regular file ‘yarn-root-resourcemanager-master.out.2’?
rm: remove regular file ‘yarn-root-resourcemanager-master.out.3’?
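The per-file confirmation prompts appear because root's shell on CentOS typically aliases rm to rm -i. Prefixing the command with a backslash (or adding -f) skips the prompts:
[root@master logs]# \rm -f *.log *.out.*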
[root@master logs]# for i in `find . -name "*.log" -o -name "*.out"`;do cat /dev/null >$i;done
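An equivalent that avoids the shell loop and handles unusual filenames is find with truncate from GNU coreutils (a variant sketch, not what was run above):
[root@master logs]# find . -type f \( -name "*.log" -o -name "*.out" \) -exec truncate -s 0 {} +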
4. On slave3 and slave4 respectively, start the new DataNode and NodeManager daemons:
[root@slave3 hadoop]# cd sbin
[root@slave3 sbin]# ls
distribute-exclude.sh mr-jobhistory-daemon.sh start-dfs.sh stop-dfs.cmd
hadoop-daemon.sh refresh-namenodes.sh start-secure-dns.sh stop-dfs.sh
hadoop-daemons.sh slaves.sh start-yarn.cmd stop-secure-dns.sh
hdfs-config.cmd start-all.cmd start-yarn.sh stop-yarn.cmd
hdfs-config.sh start-all.sh stop-all.cmd stop-yarn.sh
httpfs.sh start-balancer.sh stop-all.sh yarn-daemon.sh
kms.sh start-dfs.cmd stop-balancer.sh yarn-daemons.sh
[root@slave3 sbin]# hadoop-daemon.sh start datanode
starting datanode, logging to /usr/hadoop/logs/hadoop-root-datanode-slave3.out
[root@slave3 sbin]# yarn-daemon.sh start nodemanager
starting nodemanager, logging to /usr/hadoop/logs/yarn-root-nodemanager-slave3.out
[root@slave3 sbin]# jps
7826 Jps
7689 NodeManager
7503 DataNode
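To confirm that the new DataNodes registered with the NameNode, the cluster report can be checked from the master (the exact wording of the report varies slightly between Hadoop versions):
[root@master sbin]# hdfs dfsadmin -report | grep -i "live datanodes"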
5. On the master node, refresh the node list and rebalance:
[root@master bin]# hdfs dfsadmin -refreshNodes
Refresh nodes successful
[root@master sbin]# start-balancer.sh
starting balancer, logging to /usr/hadoop/logs/hadoop-root-balancer-master.out
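start-balancer.sh also accepts a utilization threshold in percent; for example, the following asks the balancer to bring every DataNode within 5% of the cluster's average usage:
[root@master sbin]# start-balancer.sh -threshold 5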
Then simply restart the cluster on the master node.
Reference: https://blog.csdn.net/duanbiren123/article/details/80959518
6. On the next startup, the master node has no NameNode process; check the logs directory:
[root@master wuxiaoli]# cd /usr/hadoop
[root@master hadoop]# ls
bin etc include lib libexec LICENSE.txt logs NOTICE.txt README.txt sbin share tmp
[root@master hadoop]# cd logs
[root@master logs]# ls
hadoop-root-balancer-master.log SecurityAuth-root.audit
hadoop-root-balancer-master.out yarn-bigdata-resourcemanager-master.log
hadoop-root-balancer-master.out.1 yarn-bigdata-resourcemanager-master.out
hadoop-root-balancer-master.out.2 yarn-bigdata-resourcemanager-master.out.1
hadoop-root-namenode-master.log yarn-root-resourcemanager-master.log
hadoop-root-namenode-master.out yarn-root-resourcemanager-master.out
hadoop-root-namenode-master.out.1 yarn-root-resourcemanager-master.out.1
hadoop-root-namenode-master.out.2 yarn-root-resourcemanager-master.out.2
hadoop-root-namenode-master.out.3 yarn-root-resourcemanager-master.out.3
hadoop-root-namenode-master.out.4 yarn-root-resourcemanager-master.out.4
hadoop-root-namenode-master.out.5 yarn-root-resourcemanager-master.out.5
hadoop-root-secondarynamenode-master.log yarn-wuxiaoli-resourcemanager-master.log
hadoop-root-secondarynamenode-master.out yarn-wuxiaoli-resourcemanager-master.out
hadoop-root-secondarynamenode-master.out.1 yarn-wuxiaoli-resourcemanager-master.out.1
hadoop-root-secondarynamenode-master.out.2 yarn-wuxiaoli-resourcemanager-master.out.2
hadoop-root-secondarynamenode-master.out.3 yarn-wuxiaoli-resourcemanager-master.out.3
hadoop-root-secondarynamenode-master.out.4 yarn-wuxiaoli-resourcemanager-master.out.4
hadoop-root-secondarynamenode-master.out.5 yarn-wuxiaoli-resourcemanager-master.out.5
[root@master logs]# tail -20 hadoop-root-namenode-master.log
2019-06-10 05:51:47,233 INFO org.apache.hadoop.metrics2.impl.MetricsSystemImpl: Stopping NameNode metrics system...
2019-06-10 05:51:47,234 INFO org.apache.hadoop.metrics2.impl.MetricsSystemImpl: NameNode metrics system stopped.
2019-06-10 05:51:47,234 INFO org.apache.hadoop.metrics2.impl.MetricsSystemImpl: NameNode metrics system shutdown complete.
2019-06-10 05:51:47,234 ERROR org.apache.hadoop.hdfs.server.namenode.NameNode: Failed to start namenode.
java.io.IOException: Failed to load FSImage file, see error(s) above for more info.
at org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:732)
at org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:316)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFSImage(FSNamesystem.java:1044)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFromDisk(FSNamesystem.java:707)
at org.apache.hadoop.hdfs.server.namenode.NameNode.loadNamesystem(NameNode.java:635)
at org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:696)
at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:906)
at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:885)
at org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1626)
at org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1694)
2019-06-10 05:51:47,235 INFO org.apache.hadoop.util.ExitUtil: Exiting with status 1
2019-06-10 05:51:47,238 INFO org.apache.hadoop.hdfs.server.namenode.NameNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at master/192.168.79.11
************************************************************/
[root@master logs]#
After formatting the NameNode, the master node has its NameNode back, but the slave nodes have lost their DataNodes.
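For reference, the format was presumably done with the standard command below; note that this is exactly what regenerates the clusterID and causes the DataNode mismatch diagnosed in step 9:
[root@master hadoop]# hdfs namenode -format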
7. Run stop-all.sh; it reports:
slave1: nodemanager did not stop gracefully after 5 seconds: killing with kill -9
slave2: nodemanager did not stop gracefully after 5 seconds: killing with kill -9
slave3: nodemanager did not stop gracefully after 5 seconds: killing with kill -9
slave4: nodemanager did not stop gracefully after 5 seconds: killing with kill -9
8. Restarting the cluster still brings up no DataNodes. On slave1, check the log:
[root@slave1 logs]# tail -30 hadoop-root-datanode-slave1.log
at org.apache.hadoop.hdfs.server.datanode.BPServiceActor.connectToNNAndHandshake(BPServiceActor.java:266)
at org.apache.hadoop.hdfs.server.datanode.BPServiceActor.run(BPServiceActor.java:750)
at java.lang.Thread.run(Thread.java:748)
2019-06-10 06:48:16,849 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Ending block pool service for: Block pool <registering> (Datanode Uuid 53abf1e0-86b0-4b9e-b64b-3b7fa7633fb2) service to master/192.168.79.11:9000
2019-06-10 06:48:16,954 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Block pool ID needed, but service not yet registered with NN, trace:
java.lang.Exception
at org.apache.hadoop.hdfs.server.datanode.BPOfferService.getBlockPoolId(BPOfferService.java:210)
at org.apache.hadoop.hdfs.server.datanode.BPOfferService.hasBlockPoolId(BPOfferService.java:220)
at org.apache.hadoop.hdfs.server.datanode.BlockPoolManager.remove(BlockPoolManager.java:90)
at org.apache.hadoop.hdfs.server.datanode.DataNode.shutdownBlockPool(DataNode.java:1490)
at org.apache.hadoop.hdfs.server.datanode.BPOfferService.shutdownActor(BPOfferService.java:465)
at org.apache.hadoop.hdfs.server.datanode.BPServiceActor.cleanUp(BPServiceActor.java:527)
at org.apache.hadoop.hdfs.server.datanode.BPServiceActor.run(BPServiceActor.java:787)
at java.lang.Thread.run(Thread.java:748)
2019-06-10 06:48:16,955 INFO org.apache.hadoop.hdfs.server.datanode.DataNode: Removed Block pool <registering> (Datanode Uuid 53abf1e0-86b0-4b9e-b64b-3b7fa7633fb2)
2019-06-10 06:48:16,955 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Block pool ID needed, but service not yet registered with NN, trace:
java.lang.Exception
at org.apache.hadoop.hdfs.server.datanode.BPOfferService.getBlockPoolId(BPOfferService.java:210)
at org.apache.hadoop.hdfs.server.datanode.BPOfferService.hasBlockPoolId(BPOfferService.java:220)
at org.apache.hadoop.hdfs.server.datanode.DataNode.shutdownBlockPool(DataNode.java:1491)
at org.apache.hadoop.hdfs.server.datanode.BPOfferService.shutdownActor(BPOfferService.java:465)
at org.apache.hadoop.hdfs.server.datanode.BPServiceActor.cleanUp(BPServiceActor.java:527)
at org.apache.hadoop.hdfs.server.datanode.BPServiceActor.run(BPServiceActor.java:787)
at java.lang.Thread.run(Thread.java:748)
2019-06-10 06:48:18,955 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Exiting Datanode
2019-06-10 06:48:18,958 INFO org.apache.hadoop.util.ExitUtil: Exiting with status 0
2019-06-10 06:48:18,962 INFO org.apache.hadoop.hdfs.server.datanode.DataNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down DataNode at slave1/192.168.79.22
************************************************************/
9. Reference: https://www.oipapio.com/cn/article-2932995
The cause: the NameNode was formatted more than once. Every format gives the NameNode a new clusterID and namespaceID, so the master's clusterID/namespaceID no longer match the ones recorded on each DataNode.
When Hadoop is started after such a format, it tries to create a fresh current directory on the DataNode, but the directory already exists, so creation fails; the DataNode process therefore fails to start, and the cluster as a whole fails to come up completely.
The problem can be fixed by deleting the DataNode's current folder directly.
Solution:
Delete the current folder under tmp/dfs/data on each DataNode, then restart Hadoop.
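Before deleting anything, the mismatch can be verified by comparing the clusterID recorded on the two sides (paths assume hadoop.tmp.dir points at /usr/hadoop/tmp, as in this cluster):
[root@master hadoop]# grep clusterID tmp/dfs/name/current/VERSION
[root@slave1 hadoop]# grep clusterID tmp/dfs/data/current/VERSION
If the two values differ, the DataNode will refuse to register with the NameNode.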
[root@slave2 hadoop]# ls
bin etc include lib libexec LICENSE.txt logs NOTICE.txt README.txt sbin share tmp
[root@slave2 hadoop]# cd tmp
[root@slave2 tmp]# ls
dfs nm-local-dir
[root@slave2 tmp]# cd dfs
[root@slave2 dfs]# ls
data name namesecondary
[root@slave2 dfs]# cd data
[root@slave2 data]# ls
current
[root@slave2 data]# rm -rf current
[root@slave2 data]#
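An alternative to deleting current is to edit each DataNode's tmp/dfs/data/current/VERSION and set its clusterID to the value from the master's tmp/dfs/name/current/VERSION, then restart, which preserves the existing blocks; deleting current, as done here, is simpler but discards any blocks stored on the node.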
10. When stopping the processes, the following messages appear:
slave4: nodemanager did not stop gracefully after 5 seconds: killing with kill -9
slave2: nodemanager did not stop gracefully after 5 seconds: killing with kill -9
slave1: nodemanager did not stop gracefully after 5 seconds: killing with kill -9
slave3: nodemanager did not stop gracefully after 5 seconds: killing with kill -9
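The 5-second grace period in these messages comes from the daemon scripts' stop timeout (YARN_STOP_TIMEOUT in yarn-daemon.sh, assuming your distribution's script defines it). If the kill -9 fallback is a concern, it can be lengthened in etc/hadoop/yarn-env.sh:
export YARN_STOP_TIMEOUT=30
The messages are otherwise harmless; the NodeManagers still stop, just forcibly.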
11. When adding new HBase nodes, the log files under the hbase directory's logs folder also need to be cleared.
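A quick way to do that, mirroring the loop from step 3 (a sketch; assumes HBase is installed in a directory with its own logs/ subfolder):
[root@master hbase]# for i in `find logs -name "*.log" -o -name "*.out"`; do cat /dev/null > $i; done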