From f4c8ed531f3581db3697fe5d2681e4ae64ebeaf1 Mon Sep 17 00:00:00 2001
From: Decoupes Remy <remy.decoupes@irstea.fr>
Date: Mon, 9 Dec 2019 17:43:12 +0100
Subject: [PATCH] start HDFS cluster : still some issues in configuration

---
 README.md                                     |  25 ++++-
 playbook/install-datanode.yml                 |   4 +-
 playbook/install-mononode.yml                 |   2 +-
 playbook/install-namenode.yml                 |   4 +-
 playbook/roles/hadoop-common/tasks/main.yml   |  74 +++++++------
 .../hadoop-common/templates/core-site.j2      |   2 +-
 .../hadoop-common/templates/hdfs-site.j2      |   2 +-
 .../hadoop-common/templates/mapred-site.j2    |   2 +-
 .../roles/hadoop-common/templates/workers.j2  |   5 +
 playbook/roles/hadoop-common/vars/main.yml    |   3 +-
 playbook/roles/hadoop-mononode/tasks/main.yml | 104 ++++++++++++++++++
 playbook/roles/hadoop-namenode/tasks/main.yml |   7 ++
 playbook/roles/hosts-file/vars/main.yml       |   3 +
 vagrant/cluster/Vagrantfile                   |   8 +-
 14 files changed, 197 insertions(+), 48 deletions(-)
 create mode 100644 playbook/roles/hadoop-common/templates/workers.j2
 create mode 100644 playbook/roles/hadoop-namenode/tasks/main.yml

diff --git a/README.md b/README.md
index 7c451c1..442f641 100644
--- a/README.md
+++ b/README.md
@@ -83,8 +83,27 @@ Then run the script [ansible-launch.sh](ansible-launch.sh) :
 1. Set your nodes' IP address in [VagrantFile](vagrant/cluster/Vagrantfile)
 2. Declare those IP for ansible provision in [vars](playbook/roles/hosts-file/vars/main.yml)
 3. in cli : start your multiple VM from this [directory : vagrant/cluster](vagrant/cluster) :
-```shell
-vagrant up
-```
+	```shell
+	vagrant up
+	```
+4. Format HDFS :
+ 	* ssh on namenode
+ 	* in cli : as user hadoop : change directory & format HDFS
+ 	```shell
+ 	sudo su hadoop
+ 	cd /usr/local/hadoop/bin/
+ 	hdfs namenode -format
+ 	```
+ 5. Start the HDFS daemon on your cluster
+  	* ssh on namenode
+ 	* in cli : as root : start service hadoop
+ 	 ```shell
+ 	sudo systemctl start hadoop
+ 	```
+ 	* **WORK In Progress**: systemd will report that something went wrong, but the cluster is working anyway.
+ 6. Verify your cluster is up:
+ 	* on your own device, use a web browser
+ 	* go to [IP-of-your-namenode]:9870
+ 	(by default: http://10.0.0.10:9870)
 ## Deploy cluster HDFS on servers
 work in progress
\ No newline at end of file
diff --git a/playbook/install-datanode.yml b/playbook/install-datanode.yml
index d5f94ba..90e32f2 100644
--- a/playbook/install-datanode.yml
+++ b/playbook/install-datanode.yml
@@ -5,4 +5,6 @@
   roles:
     - common
     - hosts-file
-    - hadoop-common
\ No newline at end of file
+    - role: hadoop-common
+      vars_files: 
+        - playbook/roles/hosts-file/vars/main.yml
\ No newline at end of file
diff --git a/playbook/install-mononode.yml b/playbook/install-mononode.yml
index 5937a0b..93a5c50 100644
--- a/playbook/install-mononode.yml
+++ b/playbook/install-mononode.yml
@@ -4,4 +4,4 @@
   
   roles:
     - common
-    - hadoop-common
+    - hadoop-mononode
diff --git a/playbook/install-namenode.yml b/playbook/install-namenode.yml
index 23ac001..c5d3d7b 100644
--- a/playbook/install-namenode.yml
+++ b/playbook/install-namenode.yml
@@ -5,4 +5,6 @@
   roles:
     - common
     - hosts-file
-    - hadoop-common
\ No newline at end of file
+    - role: hadoop-common
+      vars_files: 
+        - playbook/roles/hosts-file/vars/main.yml
\ No newline at end of file
diff --git a/playbook/roles/hadoop-common/tasks/main.yml b/playbook/roles/hadoop-common/tasks/main.yml
index fc3d342..c520745 100644
--- a/playbook/roles/hadoop-common/tasks/main.yml
+++ b/playbook/roles/hadoop-common/tasks/main.yml
@@ -102,43 +102,51 @@
     path: "{{ hadoopDir }}/etc/hadoop/hadoop-env.sh"
     block: "export JAVA_HOME={{ javahome }}"
 
-# - name: configure core-site.xml
-#   become: yes
-#   template:
-#     src: templates/core-site.j2
-#     dest: "{{ hadoopDir }}/etc/hadoop/core-site.xml"
-#     owner: hadoop
-#     group: hadoop
+- name: configure hdfs-site.xml
+  become: yes
+  template:
+    src: templates/hdfs-site.j2
+    dest: "{{ hadoopDir }}/etc/hadoop/hdfs-site.xml"
+    owner: hadoop
+    group: hadoop
 
-# - name: configure hdfs-site.xml
-#   become: yes
-#   template:
-#     src: templates/hdfs-site.j2
-#     dest: "{{ hadoopDir }}/etc/hadoop/hdfs-site.xml"
-#     owner: hadoop
-#     group: hadoop
+- name: configure core-site.xml
+  become: yes
+  template:
+    src: templates/core-site.j2
+    dest: "{{ hadoopDir }}/etc/hadoop/core-site.xml"
+    owner: hadoop
+    group: hadoop
 
-# - name: configure mapred-site.xml
-#   become: yes
-#   template:
-#     src: templates/mapred-site.j2
-#     dest: "{{ hadoopDir }}/etc/hadoop/mapred-site.xml"
-#     owner: hadoop
-#     group: hadoop
+- name: configure mapred-site.xml
+  become: yes
+  template:
+    src: templates/mapred-site.j2
+    dest: "{{ hadoopDir }}/etc/hadoop/mapred-site.xml"
+    owner: hadoop
+    group: hadoop
 
-# - name: copy hadoop service file
-#   become: yes
-#   template:
-#     src: templates/hadoop.service.j2
-#     dest: /etc/systemd/system/hadoop.service
+- name: configure /etc/hadoop/workers
+  become: yes
+  template:
+    src: templates/workers.j2
+    dest: "{{ hadoopDir }}/etc/hadoop/workers"
+    owner: hadoop
+    group: hadoop
 
-# - name: enable hadoop service
-#   become: yes
-#   service:
-#     daemon_reload: yes
-#     name: hadoop
-#     state: stopped
-#     enabled: yes
+- name: copy hadoop service file
+  become: yes
+  template:
+    src: templates/hadoop.service.j2
+    dest: /etc/systemd/system/hadoop.service
+
+- name: enable hadoop service
+  become: yes
+  service:
+    daemon_reload: yes
+    name: hadoop
+    state: stopped
+    enabled: yes
 
 # - name: HDFS has been already formatted ?
 #   become: yes
diff --git a/playbook/roles/hadoop-common/templates/core-site.j2 b/playbook/roles/hadoop-common/templates/core-site.j2
index ba76ece..a43b22a 100644
--- a/playbook/roles/hadoop-common/templates/core-site.j2
+++ b/playbook/roles/hadoop-common/templates/core-site.j2
@@ -1,6 +1,6 @@
 <configuration>
 	<property>
 		<name>fs.default.name</name>
-		<value>hdfs://localhost:9000</value>
+		<value>hdfs://namenode:9000</value>
 	</property>
 </configuration>
\ No newline at end of file
diff --git a/playbook/roles/hadoop-common/templates/hdfs-site.j2 b/playbook/roles/hadoop-common/templates/hdfs-site.j2
index 84f3842..9342205 100644
--- a/playbook/roles/hadoop-common/templates/hdfs-site.j2
+++ b/playbook/roles/hadoop-common/templates/hdfs-site.j2
@@ -1,6 +1,6 @@
 <configuration>
 	<property>
 		<name>dfs.replication</name>
-		<value>1</value>
+		<value>{{nbOfClusterDataNode}}</value>
 	</property>
 </configuration>
diff --git a/playbook/roles/hadoop-common/templates/mapred-site.j2 b/playbook/roles/hadoop-common/templates/mapred-site.j2
index 74e723d..89adbd8 100644
--- a/playbook/roles/hadoop-common/templates/mapred-site.j2
+++ b/playbook/roles/hadoop-common/templates/mapred-site.j2
@@ -1,4 +1,4 @@
 <property>
 <name>mapred.job.tracker</name>
-<value>localhost:9001</value>
+<value>{{localhost}}:9001</value>
 </property>
\ No newline at end of file
diff --git a/playbook/roles/hadoop-common/templates/workers.j2 b/playbook/roles/hadoop-common/templates/workers.j2
new file mode 100644
index 0000000..4599c1a
--- /dev/null
+++ b/playbook/roles/hadoop-common/templates/workers.j2
@@ -0,0 +1,5 @@
+{% for node in cluster %}
+{% if not 'namenode' in node.role %}
+{{node.hostname}}
+{% endif %}
+{% endfor %}
\ No newline at end of file
diff --git a/playbook/roles/hadoop-common/vars/main.yml b/playbook/roles/hadoop-common/vars/main.yml
index 30c8624..ce8cae9 100644
--- a/playbook/roles/hadoop-common/vars/main.yml
+++ b/playbook/roles/hadoop-common/vars/main.yml
@@ -2,4 +2,5 @@ hadoopVersion: 3.2.1
 hadoopUSRHome: /home/hadoop
 hadoopDir: /usr/local/hadoop/
 javahome: /usr/lib/jvm/java-11-openjdk-amd64
-openjdk8URL : http://security-cdn.debian.org/debian-security/pool/updates/main/o/openjdk-8/openjdk-8-jdk_8u232-b09-1~deb9u1_amd64.deb
\ No newline at end of file
+openjdk8URL : http://security-cdn.debian.org/debian-security/pool/updates/main/o/openjdk-8/openjdk-8-jdk_8u232-b09-1~deb9u1_amd64.deb
+nbOfClusterDataNode: 2
\ No newline at end of file
diff --git a/playbook/roles/hadoop-mononode/tasks/main.yml b/playbook/roles/hadoop-mononode/tasks/main.yml
index 2d66cf0..dfd95a3 100644
--- a/playbook/roles/hadoop-mononode/tasks/main.yml
+++ b/playbook/roles/hadoop-mononode/tasks/main.yml
@@ -1,3 +1,107 @@
+# Playbook based on https://fr.blog.businessdecision.com/tutoriel-cluster-hadoop/
+
+- debug:
+    msg: 
+      - "Hadoop only support java jdk 8, see https://cwiki.apache.org/confluence/display/HADOOP/Hadoop+Java+Versions when it'll support jdk 11"
+      - "Be aware that things may not work when using jdk 11 like explore HDFS using webserver on port 9870"
+      - "Default IPv4 address is : {{ ansible_default_ipv4.address }}"
+
+- name: Set java home as environment variable
+  become: yes
+  apt:
+    name:
+      - openjdk-11-jdk
+
+- name: create hadoop group
+  become: yes
+  group:
+    name: hadoop
+
+- name: create hadoop user
+  become: yes
+  user:
+    name: hadoop
+    group: hadoop
+    home: "{{ hadoopUSRHome }}"
+    createhome: yes
+    system: yes
+
+- name: Set JAVA_HOME as environment variable
+  become: yes
+  become_user : hadoop
+  blockinfile:
+    insertafter: EOF
+    path : ~/.bashrc
+    block: |
+      export JAVA_HOME={{ javahome }}
+      export HADOOP_HOME=/usr/local/hadoop
+      export PATH=$PATH:$HADOOP_HOME/bin
+
+- name: source .bashrc
+  become: yes
+  become_user: hadoop
+  shell: source ~/.bashrc
+  args:
+     executable: /bin/bash
+
+
+- name: create .ssh directory
+  become: yes
+  file:
+    path: "{{ hadoopUSRHome }}/.ssh/"
+    state: directory
+    owner: hadoop
+    group: hadoop
+    mode: 0700
+
+- name: copy ssh key
+  become: yes
+  copy:
+    src: "{{ item }}"
+    dest: "{{ hadoopUSRHome }}/.ssh/"
+    owner: hadoop
+    group: hadoop
+    mode: 0600
+  with_items:
+    - keys/id_rsa
+    - keys/id_rsa.pub
+
+- name: authorized ssh key for hadoop user
+  become: yes
+  authorized_key:
+    user: hadoop
+    state: present
+    key: "{{ lookup('file', 'keys/id_rsa.pub') }}"
+    
+- name: create a tempory directory
+  become: yes
+  file:
+    state: directory
+    path: "{{ hadoopUSRHome }}/tmp"
+
+- name: create a prod directory for hadoop
+  become: yes
+  file:
+    state: directory
+    path: "{{ hadoopDir }}"
+
+- name: "Download and Extract hadoop-{{ hadoopVersion }}"
+  become: yes
+  unarchive: 
+    src: "http://apache.mirrors.ovh.net/ftp.apache.org/dist/hadoop/core/hadoop-{{ hadoopVersion }}/hadoop-{{ hadoopVersion }}.tar.gz"
+    remote_src: yes
+    dest: "{{ hadoopDir }}"
+    extra_opts: [--strip-components=1]
+    owner: hadoop
+    group: hadoop
+
+- name : Set JAVA_HOME in hadoop-env.sh
+  become: yes
+  blockinfile:
+    insertafter: EOF
+    path: "{{ hadoopDir }}/etc/hadoop/hadoop-env.sh"
+    block: "export JAVA_HOME={{ javahome }}"
+
 - name: configure core-site.xml
   become: yes
   template:
diff --git a/playbook/roles/hadoop-namenode/tasks/main.yml b/playbook/roles/hadoop-namenode/tasks/main.yml
new file mode 100644
index 0000000..e27fcbf
--- /dev/null
+++ b/playbook/roles/hadoop-namenode/tasks/main.yml
@@ -0,0 +1,7 @@
+- name: configure core-site.xml
+  become: yes
+  template:
+    src: templates/core-site.j2
+    dest: "{{ hadoopDir }}/etc/hadoop/core-site.xml"
+    owner: hadoop
+    group: hadoop
\ No newline at end of file
diff --git a/playbook/roles/hosts-file/vars/main.yml b/playbook/roles/hosts-file/vars/main.yml
index f74023c..5d039a7 100644
--- a/playbook/roles/hosts-file/vars/main.yml
+++ b/playbook/roles/hosts-file/vars/main.yml
@@ -1,7 +1,10 @@
 cluster:
   - hostname: namenode
+    role: namenode
     IP: 10.0.0.10
   - hostname: datanode1
+    role: datanode
     IP: 10.0.0.11
   - hostname: datanode2
+    role: datanode
     IP: 10.0.0.12
diff --git a/vagrant/cluster/Vagrantfile b/vagrant/cluster/Vagrantfile
index 32160eb..54890f4 100644
--- a/vagrant/cluster/Vagrantfile
+++ b/vagrant/cluster/Vagrantfile
@@ -20,7 +20,7 @@ Vagrant.configure("2") do |config|
 
 	# Number of datanode
 	N = NUMBER_OF_DATANODE
-	(1..2).each do |machine_id|
+	(1..N).each do |machine_id|
 		config.vm.define "datanode#{machine_id}" do |machine|
 			machine.vm.box = "generic/debian10"
 			machine.vm.network "public_network", bridge:"enp1s0", use_dhcp_assigned_default_route: true
@@ -31,10 +31,8 @@ Vagrant.configure("2") do |config|
 			SHELL
 			machine.vm.hostname = "datanode#{machine_id}"
 			machine.vm.network :private_network, ip: "10.0.0.1#{machine_id}"
-			if machine_id == N
-				machine.vm.provision "ansible" do |ansible|
-					ansible.playbook = "../../playbook/install-datanode.yml"
-				end
+			machine.vm.provision "ansible" do |ansible|
+				ansible.playbook = "../../playbook/install-datanode.yml"
 			end
 		end
 	end
-- 
GitLab