Hadoop ZKFC Configure
ZKFC doesn't have any required configuration. By default, it uses the SASL mechanism to connect to ZooKeeper using Kerberos.
Optionally, activate digest-type access to ZooKeeper to manage the ZKFC znode:
{
"digest": {
"name": "zkfc",
"password": "hdfs123"
}
}
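For reference, here is a minimal sketch, not part of this module, of how such a digest could translate into the "zk-auth.txt" and "zk-acl.txt" files referenced below by "ha.zookeeper.auth" and "ha.zookeeper.acl". It assumes the ACL id is the base64-encoded SHA-1 of "name:password", as used by ZooKeeper digest authentication:
crypto = require 'crypto'
# Hypothetical values matching the example above
name = 'zkfc'
password = 'hdfs123'
# ZooKeeper digest ACL id: base64(sha1("name:password"))
id = crypto.createHash('sha1').update("#{name}:#{password}").digest('base64')
console.log "digest:#{name}:#{password}"  # line expected in zk-auth.txt
console.log "digest:#{name}:#{id}:rwcda"  # line expected in zk-acl.txt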
module.exports = (service) ->
options = service.options
Identities
options.hadoop_group = merge {}, service.deps.hadoop_core.options.hadoop_group, options.hadoop_group
options.group = merge {}, service.deps.hadoop_core.options.hdfs.group, options.group
options.user = merge {}, service.deps.hadoop_core.options.hdfs.user, options.user
Environment
# Layout
options.pid_dir ?= service.deps.hadoop_core.options.hdfs.pid_dir
options.log_dir ?= service.deps.hadoop_core.options.hdfs.log_dir
options.conf_dir ?= '/etc/hadoop-hdfs-zkfc/conf'
options.nn_conf_dir ?= service.deps.hdfs_nn_local.options.conf_dir
# Java
options.java_home ?= service.deps.java.options.java_home
options.hadoop_heap ?= service.deps.hadoop_core.options.hadoop_heap
options.hadoop_opts ?= service.deps.hadoop_core.options.hadoop_opts
options.opts ?= ''
# Misc
options.fqdn = service.node.fqdn
options.iptables ?= service.deps.iptables and service.deps.iptables.options.action is 'start'
options.hdfs_krb5_user = service.deps.hadoop_core.options.hdfs.krb5_user
options.clean_logs ?= false
Configuration
options.core_site = merge {}, service.deps.hadoop_core.options.core_site, options.core_site or {}
options.core_site['ha.zookeeper.quorum'] ?= service.deps.zookeeper_server
.filter (srv) -> srv.options.config['peerType'] is 'participant'
.map (srv) -> "#{srv.node.fqdn}:#{srv.options.config['clientPort']}"
.join(',')
# Validation
options.principal ?= service.deps.hdfs_nn_local.options.hdfs_site['dfs.namenode.kerberos.principal']
options.nn_principal ?= service.deps.hdfs_nn_local.options.hdfs_site['dfs.namenode.kerberos.principal']
options.keytab ?= service.deps.hdfs_nn_local.options.hdfs_site['dfs.namenode.keytab.file']
options.nn_keytab ?= service.deps.hdfs_nn_local.options.hdfs_site['dfs.namenode.keytab.file']
options.jaas_file ?= "#{options.conf_dir}/zkfc.jaas"
options.digest ?= {}
options.digest.name ?= 'zkfc'
options.digest.password ?= null
# Environment
if options.core_site['hadoop.security.authentication'] is 'kerberos'
options.opts = "-Djava.security.auth.login.config=#{options.jaas_file} #{options.opts}"
# Enrich "core-site.xml" with acl and auth
options.core_site['ha.zookeeper.acl'] ?= "@#{options.conf_dir}/zk-acl.txt"
options.core_site['ha.zookeeper.auth'] = "@#{options.conf_dir}/zk-auth.txt"
# Enrich "hdfs-site.xml"
options.hdfs_site ?= {}
options.hdfs_site['dfs.ha.zkfc.port'] ?= '8019'
for property in [
'dfs.namenode.kerberos.principal'
'dfs.namenode.keytab.file'
# 'dfs.namenode.kerberos.internal.spnego.principal'
# 'dfs.namenode.kerberos.https.principal'
# 'dfs.web.authentication.kerberos.principal'
'dfs.ha.automatic-failover.enabled'
'dfs.nameservices'
'dfs.internal.nameservices'
'fs.permissions.umask-mode'
] then options.hdfs_site[property] ?= service.deps.hdfs_nn_local.options.hdfs_site[property]
for property, value of service.deps.hdfs_nn_local.options.hdfs_site
ok = false
ok = true if /^dfs\.namenode\.\w+-address/.test property
# ok = true if property.indexOf('dfs.client.failover.proxy.provider.') is 0
ok = true if property.indexOf('dfs.ha.namenodes.') is 0
continue unless ok
options.hdfs_site[property] ?= value
options.nn_hosts ?= service.deps.hdfs_nn.map( (srv) -> srv.node.fqdn ).join(',')
Kerberos
options.krb5 ?= {}
options.krb5.realm ?= service.deps.krb5_client.options.etc_krb5_conf?.libdefaults?.default_realm
throw Error 'Required Option: "realm"' unless options.krb5.realm
options.krb5.admin ?= service.deps.krb5_client.options.admin[options.krb5.realm]
HA
options.dfs_nameservices ?= service.deps.hdfs_nn_local.options.hdfs_site['dfs.nameservices']
options.automatic_failover ?= service.deps.hdfs_nn_local.options.hdfs_site['dfs.ha.automatic-failover.enabled'] is 'true'
options.active_nn_host ?= service.deps.hdfs_nn_local.options.active_nn_host
options.standby_nn_host ?= service.deps.hdfs_nn_local.options.standby_nn_host
options.active_shortname ?= service.instances.filter( (instance) -> instance.node.fqdn is options.active_nn_host )[0].node.hostname
options.standby_shortname ?= service.instances.filter( (instance) -> instance.node.fqdn is options.standby_nn_host )[0].node.hostname
# options.active_shortname = service.deps.hdfs_nn_local.filter( (srv) -> srv.node.fqdn is srv.options.active_nn_host )[0].node.hostname
# options.standby_shortname = service.deps.hdfs_nn_local.filter( (srv) -> srv.node.fqdn is srv.options.standby_nn_host )[0].node.hostname
Fencing
To prevent a split-brain scenario, in addition to the quorum journal process that already protects writes, sshfence opens an SSH connection from the new active NameNode to the previously active, malfunctioning one in order to "shoot it in the head" (STONITH).
If the previous active machine is dead, the SSH connection will fail, so another fencing method must be configured as a fallback so that failover is not blocked; this is the purpose of the "shell(/bin/true)" method below.
options.hdfs_site['dfs.ha.fencing.methods'] ?= """
sshfence(#{options.user.name})
shell(/bin/true)
"""
options.hdfs_site['dfs.ha.fencing.ssh.connect-timeout'] ?= '30000'
options.hdfs_site['dfs.ha.fencing.ssh.private-key-files'] ?= "#{options.user.home}/.ssh/id_rsa"
# TODO: keys must be local or remote, only local supported for the moment
throw Error "Required Option: ssh_fencing.private_key" unless options.ssh_fencing.private_key
throw Error "Required Option: ssh_fencing.public_key" unless options.ssh_fencing.public_key
Wait
options.wait_zookeeper_server = service.deps.zookeeper_server[0].options.wait
options.wait_hdfs_nn = service.deps.hdfs_nn_local.options.wait
Dependencies
{merge} = require '@nikitajs/core/lib/misc'