Building a Spark cluster with Docker

1. Required packages
hadoop-2.8.0.tar.gz, scala-2.12.2.tgz, spark-2.1.1-bin-hadoop2.7.tgz, zookeeper-3.4.6.tar.gz, jdk-8u73-linux-x64.tar.gz

2. Installation
Place all of the packages above in ~/software on the host
docker pull ubuntu:14.04
docker run --name ubuntu_spark -h ubuntu -v ~/software/:/software -it ubuntu:14.04

#Install software in the container
apt-get -y update 
apt-get install ssh
apt-get install vim
vim ~/.bashrc
#Add this line so sshd is started
/usr/sbin/sshd
#Save and exit


#Allow root to log in over ssh
vim /etc/ssh/sshd_config   #change PermitRootLogin no to PermitRootLogin yes
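A non-interactive alternative (a sketch; adjust the pattern if your sshd_config ships a commented-out or different default value):
sed -i 's/^#\?PermitRootLogin.*/PermitRootLogin yes/' /etc/ssh/sshd_config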


#Generate the SSH access key:
cd ~/
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cd .ssh
cat id_rsa.pub >> authorized_keys
#With the ssh service running, verify that it works; this should print the current time
ssh localhost date


#Install the JDK
cd /software
tar -zxvf jdk-8u73-linux-x64.tar.gz
mv jdk1.8.0_73 /usr/local/java
vim ~/.bashrc
#Add
export JAVA_HOME=/usr/local/java
export PATH=$PATH:$JAVA_HOME/bin
#Save and exit, reload the environment, then verify
source ~/.bashrc
java -version




#Install Scala
tar -zxvf scala-2.12.2.tgz
mv scala-2.12.2 /usr/local/scala
vim ~/.bashrc
#Add
export SCALA_HOME=/usr/local/scala
export PATH=$PATH:$SCALA_HOME/bin


#Install ZooKeeper
tar -zxvf zookeeper-3.4.6.tar.gz
mv zookeeper-3.4.6 ~/zookeeper
cd ~/zookeeper/conf/


cp zoo_sample.cfg zoo.cfg
vim zoo.cfg
#Modify:
dataDir=/root/zookeeper/tmp
#Append at the end:
server.1=cloud1:2888:3888
server.2=cloud2:2888:3888
server.3=cloud3:2888:3888
#Save and exit, then create the data directory
mkdir ~/zookeeper/tmp
#Create an empty myid file
touch ~/zookeeper/tmp/myid
#Finally, write this node's server ID into it
echo 1 > ~/zookeeper/tmp/myid




#Install Hadoop
cd /software
tar -zxvf hadoop-2.8.0.tar.gz
mv hadoop-2.8.0 ~/hadoop
vim ~/.bashrc
#Add
export HADOOP_HOME=/root/hadoop
export PATH=$PATH:$HADOOP_HOME/bin
#Save and exit


#Edit the Hadoop configuration files
vim ~/hadoop/etc/hadoop/hadoop-env.sh
#Set JAVA_HOME
export JAVA_HOME=/usr/local/java




vim ~/hadoop/etc/hadoop/core-site.xml
#Add the following inside <configuration>
<property>
    <name>fs.defaultFS</name>
    <value>hdfs://ns1</value>
</property>
<property>
    <name>hadoop.tmp.dir</name>
    <value>/root/hadoop/tmp</value>
</property>
<property>
    <name>ha.zookeeper.quorum</name> 
    <value>cloud1:2181,cloud2:2181,cloud3:2181</value>
</property>




vim ~/hadoop/etc/hadoop/hdfs-site.xml
#Add the following inside <configuration>
<property>
    <name>dfs.nameservices</name>
    <value>ns1</value>
</property>
<property>
    <name>dfs.ha.namenodes.ns1</name>
    <value>nn1,nn2</value>
</property>
<property>
    <name>dfs.namenode.rpc-address.ns1.nn1</name>
    <value>cloud1:9000</value>
</property>
<property>
    <name>dfs.namenode.http-address.ns1.nn1</name>
    <value>cloud1:50070</value>
</property>
<property>
    <name>dfs.namenode.rpc-address.ns1.nn2</name>
    <value>cloud2:9000</value>
</property>
<property>
    <name>dfs.namenode.http-address.ns1.nn2</name>
    <value>cloud2:50070</value>
</property>
<property>
    <name>dfs.namenode.shared.edits.dir</name> 
    <value>qjournal://cloud1:8485;cloud2:8485;cloud3:8485/ns1</value>
</property>
<property>
    <name>dfs.journalnode.edits.dir</name>
    <value>/root/hadoop/journal</value>
</property>
<property>
    <name>dfs.ha.automatic-failover.enabled</name>
    <value>true</value>
</property>
<property>
    <name>dfs.client.failover.proxy.provider.ns1</name>
    <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<property>
    <name>dfs.ha.fencing.methods</name>
    <value>
    sshfence
    shell(/bin/true)
    </value>
</property>
<property>
    <name>dfs.ha.fencing.ssh.private-key-files</name>
    <value>/root/.ssh/id_rsa</value>
</property>
<property>
    <name>dfs.ha.fencing.ssh.connect-timeout</name>
    <value>30000</value>
</property>




vim ~/hadoop/etc/hadoop/yarn-site.xml
#Add the following inside <configuration>
<property>
    <name>yarn.resourcemanager.hostname</name>
    <value>cloud1</value>
</property>
<property> 
    <name>yarn.nodemanager.aux-services</name> 
    <value>mapreduce_shuffle</value> 
</property>




vim ~/hadoop/etc/hadoop/slaves
#Set the contents to
cloud1
cloud2
cloud3


#Install and configure Spark
cd /software
tar -xzvf spark-2.1.1-bin-hadoop2.7.tgz
mv spark-2.1.1-bin-hadoop2.7 ~/spark


vim ~/.bashrc
#Add
export SPARK_HOME=/root/spark
export PATH=$PATH:$SPARK_HOME/bin
#Save and exit
cd ~/spark
cp ~/spark/conf/spark-env.sh.template ~/spark/conf/spark-env.sh
vim ~/spark/conf/spark-env.sh
#Add
export SPARK_MASTER_IP=cloud1
export SPARK_WORKER_MEMORY=128m
export JAVA_HOME=/usr/local/java
export SCALA_HOME=/usr/local/scala
export SPARK_HOME=/root/spark
export HADOOP_CONF_DIR=/root/hadoop/etc/hadoop
export SPARK_LIBRARY_PATH=$SPARK_HOME/lib
export SCALA_LIBRARY_PATH=$SPARK_LIBRARY_PATH
export SPARK_WORKER_CORES=1
export SPARK_WORKER_INSTANCES=1
export SPARK_MASTER_PORT=7077


vim ~/spark/conf/slaves
#Add
cloud1
cloud2
cloud3


#On the host, commit the ubuntu_spark container as a new image and tag it spark:
#Commit the ubuntu_spark container; the command prints the ID of the new image
docker commit ubuntu_spark
#Tag the new image as spark:1.0
docker tag <image id> spark:1.0
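If you prefer, the two steps can be combined into one line; this sketch simply relies on docker commit printing the new image ID:
docker tag "$(docker commit ubuntu_spark)" spark:1.0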


docker run --name cloud1 -h cloud1 --add-host cloud1:172.17.0.3 --add-host cloud2:172.17.0.4 --add-host cloud3:172.17.0.5 -p 50070:50070 -p 8080:8080 -p 8088:8088 -it spark:1.0


docker run --name cloud2 -h cloud2 --add-host cloud1:172.17.0.3 --add-host cloud2:172.17.0.4 --add-host cloud3:172.17.0.5  -it spark:1.0


docker run --name cloud3 -h cloud3 --add-host cloud1:172.17.0.3 --add-host cloud2:172.17.0.4 --add-host cloud3:172.17.0.5  -it spark:1.0
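The fixed 172.17.0.x addresses above assume Docker's default bridge network and this exact start order; it is worth confirming the actual container addresses on the host, for example:
docker inspect -f '{{.NetworkSettings.IPAddress}}' cloud1 cloud2 cloud3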


#Run on cloud2
echo 2 > ~/zookeeper/tmp/myid
#Run on cloud3
echo 3 > ~/zookeeper/tmp/myid


#Run on all three nodes: cloud1, cloud2 and cloud3
~/zookeeper/bin/zkServer.sh start


#Check the ZooKeeper status
~/zookeeper/bin/zkServer.sh status


#Set up passwordless SSH between cloud1, cloud2 and cloud3; the detailed steps are not written out here (see the sketch below)
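One possible approach (a sketch): since cloud1, cloud2 and cloud3 were all started from the same spark:1.0 image, they already share the key pair generated earlier, so it is usually enough to log in to each node once from cloud1 to accept the host keys; if the keys differ, copy the public key over with ssh-copy-id instead.
for h in cloud1 cloud2 cloud3; do ssh -o StrictHostKeyChecking=no root@$h date; done
#if needed: ssh-copy-id root@cloud2 && ssh-copy-id root@cloud3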


#Start the JournalNodes (run this on cloud1 to start them on all nodes; note that it calls hadoop-daemons.sh, the script with the plural "s")
#Run on cloud1
~/hadoop/sbin/hadoop-daemons.sh start journalnode
#Format HDFS
~/hadoop/bin/hdfs namenode -format
#Format the failover state in ZooKeeper
~/hadoop/bin/hdfs zkfc -formatZK
#Start HDFS and YARN
~/hadoop/sbin/start-dfs.sh
~/hadoop/sbin/start-yarn.sh
#Start the Spark cluster
~/spark/sbin/start-all.sh
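Optionally, run a quick smoke test against the standalone master (a sketch; the examples jar name may differ slightly across Spark distributions):
~/spark/bin/spark-submit --master spark://cloud1:7077 \
    --class org.apache.spark.examples.SparkPi \
    ~/spark/examples/jars/spark-examples_*.jar 10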


Once everything is up, the web UIs can be opened in a browser on the host (the cloud1 ports are also published, so localhost works as well):
HDFS:  cloud1:50070
YARN:  cloud1:8088
Spark: cloud1:8080
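As a rough sanity check (a sketch assuming the paths used in this guide), jps on each node should list the daemons that belong to it, e.g. NameNode and DFSZKFailoverController on cloud1/cloud2, and DataNode, JournalNode, NodeManager, QuorumPeerMain and Worker on all three nodes:
for h in cloud1 cloud2 cloud3; do echo "== $h =="; ssh root@$h /usr/local/java/bin/jps; done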
