Vagrantfile for Hadoop (3.3) Cluster with Hive (4.0) and Spark (3.5)
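Four files follow: a bring-up script (shown first), the master.sh and nodes.sh provisioning scripts, and the Vagrantfile. Bring the two worker nodes up before the master; master.sh distributes SSH keys, the Hadoop tree, and the cluster configuration to node1 and node2 while it provisions.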
#!/bin/bash
set -euxo pipefail
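# Boot the workers first: master.sh pushes SSH keys and the Hadoop
# distribution to node1/node2 during its own provisioning run.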
vagrant up node1
vagrant up node2
vagrant up master
#!/bin/bash
set -euxo pipefail
HOSTNAME=$1
: "Set hostname" && {
sudo hostname "$HOSTNAME"
echo "$HOSTNAME" | sudo tee /etc/hostname > /dev/null
}
: "Edit hosts file" && {
if ! bash -c "grep 192.168.56.10 /etc/hosts" ; then
cat << 'EOF' | sudo tee -a /etc/hosts > /dev/null
192.168.56.10 master
192.168.56.11 node1
192.168.56.12 node2
EOF
fi
}
: "Install common packages" && {
sudo yum -y install epel-release
sudo yum -y install java-1.8.0-openjdk-devel openssh-clients rsync wget sshpass
}
: "Download Hadoop" && {
if ! bash -c "ls | grep hadoop-*.tar.gz"; then
wget http://ftp.riken.jp/net/apache/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz -nv
fi
# --skip-old-files so a re-run does not abort on already-extracted files
tar --skip-old-files -xf hadoop-3.3.6.tar.gz
}
: "Set environment variables to shell RC file" && {
if ! bash -c "grep JAVA_HOME ~/.bashrc"; then
cat << 'EOF' >> ~/.bashrc
export JAVA_HOME=/usr/lib/jvm/jre-1.8.0-openjdk
export HADOOP_HOME=~/hadoop-3.3.6
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$JAVA_HOME/bin:$PATH
EOF
fi
set +u
source ~/.bashrc
set -u
}
: "Hadoop execution check" && {
hadoop version
}
: "Install SSH public key to all nodes" && {
if [ ! -f ~/.ssh/id_ed25519 ]; then
ssh-keygen -t ed25519 -P '' -f ~/.ssh/id_ed25519
fi
for node in master node1 node2; do
sshpass -p "vagrant" ssh-copy-id -i ~/.ssh/id_ed25519.pub -o "StrictHostKeyChecking no" $node
done;
}
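# Optional sanity check: confirm key-based login works to every node before
# scp and the Hadoop start scripts rely on it.
: "Verify passwordless SSH" && {
for node in master node1 node2; do
ssh -o BatchMode=yes $node true
done
}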
: "Copy Hadoop directory to nodes" && {
for node in node1 node2; do
scp -r $HADOOP_HOME $node:~/
done;
}
: "Setting configuration files" && {
: "etc/hadoop/workers" && {
cat << 'EOF' > $HADOOP_HOME/etc/hadoop/workers
node1
node2
EOF
}
: "etc/hadoop/core-site.xml" && {
if ! bash -c "grep fs.defaultFS $HADOOP_HOME/etc/hadoop/core-site.xml"; then
cat << 'EOF' > /tmp/core-site.xml.property
<property>
<name>fs.defaultFS</name>
<value>hdfs://master:9000</value>
</property>
EOF
sed -i -e '
/^<configuration>$/r /tmp/core-site.xml.property
/^$/d
' $HADOOP_HOME/etc/hadoop/core-site.xml
fi
}
: "etc/hadoop/hdfs-site.xml" && {
if ! bash -c "grep dfs.replication $HADOOP_HOME/etc/hadoop/hdfs-site.xml" ; then
cat << 'EOF' > /tmp/hdfs-site.xml.property
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>master:50090</value>
</property>
EOF
sed -i -e '
/^<configuration>$/r /tmp/hdfs-site.xml.property
/^$/d
' $HADOOP_HOME/etc/hadoop/hdfs-site.xml
fi
}
: "etc/hadoop/mapred-site.xml" && {
if ! bash -c "grep mapreduce.framework.nam $HADOOP_HOME/etc/hadoop/mapred-site.xml"; then
cat << 'EOF' > /tmp/mapred-site.xml.property
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.application.classpath</name>
<value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
</property>
EOF
sed -i -e '
/^<configuration>$/r /tmp/mapred-site.xml.property
/^$/d
' $HADOOP_HOME/etc/hadoop/mapred-site.xml
fi
}
: "etc/hadoop/yarn-site.xml" && {
if ! bash -c "grep yarn.nodemanager.aux-service $HADOOP_HOME/etc/hadoop/yarn-site.xml"; then
cat << 'EOF' > /tmp/yarn-site.xml.property
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>master</value>
</property>
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,PATH,LANG,TZ,HADOOP_MAPRED_HOME</value>
</property>
<property>
<name>yarn.nodemanager.pmem-check-enabled</name>
<value>false</value>
</property>
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
EOF
sed -i -e '
/^<configuration>$/r /tmp/yarn-site.xml.property
/^$/d
' $HADOOP_HOME/etc/hadoop/yarn-site.xml
fi
}
: "Copy to workers" && {
for node in node1 node2; do
scp -r $HADOOP_HOME/etc/hadoop/* $node:$HADOOP_HOME/etc/hadoop/
done;
}
}
: "Format HDFS" && {
$HADOOP_HOME/bin/hdfs namenode -format -force
}
: "Start daemons" && {
: "HDFS" && {
if ! bash -c "jps | grep NameNode"; then
$HADOOP_HOME/sbin/start-dfs.sh
fi
}
: "YARN" && {
if ! bash -c "jps | grep ResourceManager"; then
$HADOOP_HOME/sbin/start-yarn.sh
fi
}
: "MapReduce JobHistory server" && {
if ! bash -c "jps | grep JobHistoryServer" ; then
$HADOOP_HOME/bin/mapred --daemon start historyserver
fi
}
: "Check YARN" && {
$HADOOP_HOME/bin/yarn app -list
$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.6.jar pi 10 1000
}
}
: "Setup Hive" && {
: "Download Hive" && {
if ! bash -c "ls | grep apache-hive-*.tar.gz" ; then
wget http://ftp.riken.jp/net/apache/hive/hive-4.0.1/apache-hive-4.0.1-bin.tar.gz -nv
fi
tar --skip-old-files -xf apache-hive-4.0.1-bin.tar.gz
}
: "Set environment variables to shell RC file" && {
if ! bash -c "grep HIVE_HOME ~/.bashrc" ; then
cat << 'EOF' >> ~/.bashrc
export HIVE_HOME=~/apache-hive-4.0.1-bin
export PATH=$HIVE_HOME/bin:$PATH
EOF
fi
set +u
source ~/.bashrc
set -u
}
: "Setup MetaStore backend RDB" && {
: "Install MariaDB" && {
sudo yum -y install mariadb-server mariadb-java-client
}
: "Add [mysqld] sectin if not exists" && {
MARIADB_CONFIG_FILE="/etc/my.cnf.d/mariadb-server.cnf"
if ! grep -q '^\[mysqld\]' "$MARIADB_CONFIG_FILE"; then
echo -e "\n[mysqld]" | sudo tee -a "$MARIADB_CONFIG_FILE" > /dev/null
fi
}
: "Add or replace 'sql_mode'" && {
MARIADB_SQL_MODE="STRICT_TRANS_TABLES,ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION,ANSI_QUOTES,NO_BACKSLASH_ESCAPES"
if grep -q '^\s*sql_mode' "$MARIADB_CONFIG_FILE"; then
sudo sed -i "/^\s*sql_mode/c\sql_mode=${MARIADB_SQL_MODE}" "$MARIADB_CONFIG_FILE"
else
sudo sed -i "/^\[mysqld\]/a\sql_mode=${MARIADB_SQL_MODE}" "$MARIADB_CONFIG_FILE"
fi
}
: "Enable and start MariaDB" && {
sudo systemctl enable mariadb
sudo systemctl start mariadb
}
: "Create MariaDB user for Hive" && {
sudo mysql -e "
CREATE USER 'hive'@'%' IDENTIFIED BY 'hive';
GRANT ALL PRIVILEGES ON *.* TO 'hive'@'%'; FLUSH PRIVILEGES;
"
}
: "Create JDBC driver symlink" && {
# -f so a re-run does not fail on an existing link; Hive loads the MariaDB
# JDBC driver from lib/ to reach the metastore database.
ln -sf /usr/lib/java/mariadb-java-client.jar $HIVE_HOME/lib
}
}
: "Setting configuration files" && {
: "conf/hive-site.xml" && {
if ! bash -c "grep hive.server2.authentication $HIVE_HOME/conf/hive-site.xml"; then
cat << 'EOF' > $HIVE_HOME/conf/hive-site.xml
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mariadb://localhost/metastore?createDatabaseIfNotExist=true</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.mariadb.jdbc.Driver</value>
</property>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/hive/warehouse</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>hive</value>
</property>
<property>
<name>hive.server2.authentication</name>
<value>NONE</value>
</property>
<property>
<name>hive.server2.enable.doAs</name>
<value>false</value>
</property>
</configuration>
EOF
fi
}
}
: "Setup HDFS working directory" && {
$HADOOP_HOME/bin/hadoop fs -mkdir -p /user/hive/warehouse
$HADOOP_HOME/bin/hadoop fs -chmod g+w /user/hive/warehouse
$HADOOP_HOME/bin/hadoop fs -mkdir -p /tmp
$HADOOP_HOME/bin/hadoop fs -chmod g+w /tmp
$HIVE_HOME/bin/schematool -dbType mysql -initSchema --verbose
$HIVE_HOME/bin/schematool -dbType mysql -info --verbose
}
: "Start server process" && {
nohup $HIVE_HOME/bin/hive --service metastore &
sleep 5
nohup $HIVE_HOME/bin/hive --service hiveserver2 &
sleep 5
}
: "Check Hive" && {
$HIVE_HOME/bin/beeline -u jdbc:hive2://localhost:10000 -e "select 1"
}
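# Optional end-to-end check: create a table through HiveServer2 and read it
# back (smoke_test is just an illustrative name).
: "Hive smoke test" && {
$HIVE_HOME/bin/beeline -u jdbc:hive2://localhost:10000 \
-e "CREATE TABLE IF NOT EXISTS smoke_test (id INT)" \
-e "INSERT INTO smoke_test VALUES (1)" \
-e "SELECT * FROM smoke_test"
}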
}
: "Setup Spark" && {
: "Download Spark" && {
if ! bash -c "ls | grep spark-*.tar.gz" ; then
wget -nv https://ftp.riken.jp/net/apache/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz
fi
tar --skip-old-files -xf spark-3.5.3-bin-hadoop3.tgz
}
: "Set environment variables to shell RC file" && {
if ! bash -c "grep SPARK_HOME ~/.bashrc" ; then
cat << 'EOF' >> ~/.bashrc
export SPARK_HOME=~/spark-3.5.3-bin-hadoop3
export PATH=$SPARK_HOME/bin:$PATH
EOF
fi
set +u
source ~/.bashrc
set -u
}
: "Create JDBC driver symlink" && {
# Same driver for Spark, so Spark SQL can reach the metastore database;
# -f keeps re-runs idempotent.
ln -sf /usr/lib/java/mariadb-java-client.jar $SPARK_HOME/jars/
}
: "Check Spark" && {
$SPARK_HOME/bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn \
$SPARK_HOME/examples/jars/spark-examples_*.jar \
10
}
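# Optional: point Spark SQL at the Hive metastore. A minimal sketch assuming
# that linking hive-site.xml into Spark's conf dir is sufficient for this
# setup (the MariaDB driver is already in $SPARK_HOME/jars).
: "Spark SQL against the Hive metastore" && {
ln -sf $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/
$SPARK_HOME/bin/spark-sql --master yarn -e "SHOW DATABASES"
}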
}
#!/bin/bash
set -euxo pipefail
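# Worker provisioning only prepares the hostname, hosts entries, Java, and
# the shell environment; the master copies the Hadoop tree over SSH.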
HOSTNAME=$1
: "Set hostname" && {
sudo hostname "$HOSTNAME"
echo "$HOSTNAME" | sudo tee /etc/hostname > /dev/null
}
: "Edit hosts file" && {
if ! bash -c "grep 192.168.56.10 /etc/hosts" ; then
cat << 'EOF' | sudo tee -a /etc/hosts > /dev/null
192.168.56.10 master
192.168.56.11 node1
192.168.56.12 node2
EOF
fi
}
: "Install common packages" && {
sudo yum -y install java-1.8.0-openjdk-devel openssh-clients rsync wget
}
: "Set environment variables to shell RC file" && {
if ! bash -c "grep JAVA_HOME /etc/hosts" ; then
cat << 'EOF' >> ~/.bashrc
export JAVA_HOME=/usr/lib/jvm/jre-1.8.0-openjdk
export HADOOP_HOME=~/hadoop-3.3.6
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$JAVA_HOME/bin:$PATH
EOF
fi
}
# -*- mode: ruby -*-
# vi: set ft=ruby :
# Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
VAGRANTFILE_API_VERSION = "2"
Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
config.vm.define :master, primary: true do |master|
master.vm.box = "bento/rockylinux-9"
master.vm.network "private_network", ip: "192.168.56.10"
master.vm.provider "virtualbox" do |vb|
vb.memory = "8192"
end
master.vm.provision "shell", privileged: false do |s|
s.path = "master.sh"
s.args = "master"
end
end
(1..2).each do |i|
node_name = "node#{i}"
config.vm.define node_name do |node|
node.vm.box = "bento/rockylinux-9"
node.vm.network "private_network", ip: "192.168.56.1#{i}"
node.vm.provider "virtualbox" do |vb|
vb.memory = "4096"
end
node.vm.provision "shell", privileged: false do |s|
s.path = "nodes.sh"
s.args = node_name
end
end
end
end