Hadoop HDFS NameNode Check
Check the health of the NameNode(s).
In HA mode, we need to ensure both NameNodes are installed before testing SSH Fencing. Otherwise, a race condition may occur if a host attempts to connect through SSH to another one where the public key isn't deployed yet.
# Exported action: `header` is its label, `handler` receives an object whose
# `options` property supplies every setting read by the checks that follow.
module.exports = header: 'HDFS NN Check', handler: ({options}) ->
Wait
Wait for the HDFS NameNode to be started.
# TODO: replace this wait with an assertion
# Registered with `once: true`; delegates to the NameNode wait module using `options.wait`.
@call({once: true}, 'ryba/hadoop/hdfs_nn/wait', options.wait)
Check HTTP
# Resolve the NameNode web endpoint from hdfs-site. `protocol` and `address`
# are also read by the WebHDFS checks later in this handler.
protocol = if options.hdfs_site['dfs.http.policy'] is 'HTTP_ONLY' then 'http' else 'https'
nameservice = if options.nameservice then ".#{options.nameservice}" else ''
shortname = if options.nameservice then ".#{options.hostname}" else ''
address_property = "dfs.namenode.#{protocol}-address#{nameservice}#{shortname}"
address = options.hdfs_site[address_property]
# Fail with an explicit message rather than a TypeError on `undefined.split`
throw Error "Missing hdfs_site property: #{address_property}" unless address
[_, port] = address.split ':'
# Only referenced by the disabled SecurityEnabled assertion at the end of this check
securityEnabled = protocol is 'https'
# Query the NameNodeStatus JMX bean over SPNEGO and validate the JSON payload
@system.execute
  retry: 2
  header: 'HTTP'
  # The URL is single-quoted so the shell never treats `?` as a glob pattern
  cmd: mkcmd.hdfs options.hdfs_krb5_user, "curl --negotiate -k -u : '#{protocol}://#{options.fqdn}:#{port}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus'"
, (err, obj) ->
  throw err if err
  data = JSON.parse obj.stdout
  # After HDP2.2, the response needs some time before returning any beans
  throw Error "Invalid Response" unless Array.isArray data?.beans
  # throw Error "Invalid Response" unless /^Hadoop:service=NameNode,name=NameNodeStatus$/.test data?.beans[0]?.name
  # throw Error "WARNING: Invalid security (#{data.beans[0].SecurityEnabled}, instead of #{securityEnabled}" unless data.beans[0].SecurityEnabled is securityEnabled
Check Health
Connect to the provided NameNode to check its health. The NameNode is capable of performing some diagnostics on itself, including checking if internal services are running as expected. This command will return 0 if the NameNode is healthy, non-zero otherwise. One might use this command for monitoring purposes.
The checkHealth return result is not completely implemented. See more: http://hadoop.apache.org/docs/r2.0.2-alpha/hadoop-yarn/hadoop-yarn-site/HDFSHighAvailability.html#Administrative_commands
# `haadmin -checkHealth` targets this host's NameNode id; the action is only
# enabled when a nameservice is configured (condition evaluated lazily).
ha_health_cmd = "hdfs --config '#{options.conf_dir}' haadmin -checkHealth #{options.hostname}"
@system.execute
  header: 'HA Health'
  if: -> options.nameservice
  cmd: mkcmd.hdfs options.hdfs_krb5_user, ha_health_cmd
Check FSCK
Check for various inconsistencies on the overall filesystem. Use the command
hdfs fsck -list-corruptfileblocks
to list the corrupted blocks.
Corrupted blocks for removal can be found with the command:
hdfs fsck / | egrep -v '^\.+$' | grep -v replica | grep -v Replica
Additional information may be found on the CentOS HowTos site.
# FSCK runs by default; `options.check_hdfs_fsck` may disable it and
# `options.force_check` re-enables it regardless.
check_hdfs_fsck = if options.check_hdfs_fsck? then !!options.check_hdfs_fsck else true
@system.execute
  header: 'FSCK'
  retry: 3
  wait: 60000
  # File descriptor 5 duplicates stdout so the full report is still printed
  # while the exit code only reflects whether the last line contains HEALTHY.
  # The configuration directory is quoted like in every other hdfs command of
  # this file, so a path containing spaces does not split into several arguments.
  cmd: mkcmd.hdfs options.hdfs_krb5_user, "exec 5>&1; hdfs --config '#{options.conf_dir}' fsck / | tee /dev/fd/5 | tail -1 | grep HEALTHY 1>/dev/null"
  if: options.force_check or check_hdfs_fsck
Check HDFS
Attempt to place a file inside HDFS. The file "/etc/passwd" will be placed at "/user/{test_user}/{hostname}-nn".
# Destination of the probe file inside the test user's HDFS home directory
nn_probe_path = "/user/#{options.test.user.name}/#{options.hostname}-nn"
# Exit code 2 (file already uploaded) is reported as skipped, not as a failure
@system.execute
  header: 'HDFS'
  code_skipped: 2
  cmd: mkcmd.test options.test_krb5_user, """
  if hdfs --config '#{options.conf_dir}' dfs -test -f #{nn_probe_path}; then exit 2; fi
  echo 'Upload file to HDFS'
  hdfs --config '#{options.conf_dir}' dfs -put /etc/passwd #{nn_probe_path}
  """
Check WebHDFS
Check the Kerberos SPNEGO and the Hadoop delegation token. Will only be executed if the file "/user/{test_user}/check-{hostname}-webhdfs" generated by this action is not present on HDFS.
Read Delegation Tokens in Hadoop Security for more information.
# When this host is not the active NameNode, a WebHDFS listing is expected to
# be rejected with a StandbyException; any other answer fails the check.
is_passive_nn = options.active_nn_host isnt options.fqdn
@call
  header: 'WebHDFS Passive'
  if: is_passive_nn
, ->
  list_url = "#{protocol}://#{address}/webhdfs/v1/user/#{options.test.user.name}?op=LISTSTATUS"
  @system.execute
    cmd: mkcmd.test options.test_krb5_user, """
    curl -s --negotiate --insecure -u : "#{list_url}"
    kdestroy
    """
  , (err, data) ->
    throw err if err
    # Any parsing or navigation failure is re-thrown wrapped in a new Error
    try
      reply = JSON.parse data.stdout
      exception = reply.RemoteException.exception
    catch e
      throw Error e
    throw Error "Invalid result" unless exception is 'StandbyException'
# WebHDFS check on the active NameNode: create a marker file, list it through
# SPNEGO, then list it again authenticated only by a delegation token.
@call
  header: 'WebHDFS Active'
  if: options.active_nn_host is options.fqdn
, ->
  # `protocol` and `address` are resolved once at the top of the handler for
  # the HTTP check and are reused here instead of being recomputed.
  marker = "check-#{options.hostname}-webhdfs"
  webhdfs_url = "#{protocol}://#{address}/webhdfs/v1"
  # Throws unless the LISTSTATUS response held in `stdout` lists the marker file
  assert_marker_listed = (stdout) ->
    try
      count = JSON.parse(stdout).FileStatuses.FileStatus.filter((file) -> file.pathSuffix is marker).length
    catch e then throw Error e
    throw Error "Invalid result" unless count
  # Create the marker file (relative path) as the test user
  @system.execute
    cmd: mkcmd.test options.test_krb5_user, """
    hdfs --config '#{options.conf_dir}' dfs -touchz #{marker}
    kdestroy
    """
    code_skipped: 2
  # SPNEGO-authenticated listing must expose the marker file
  @system.execute
    cmd: mkcmd.test options.test_krb5_user, """
    curl -s --negotiate --insecure -u : "#{webhdfs_url}/user/#{options.test.user.name}?op=LISTSTATUS"
    kdestroy
    """
  , (err, data) ->
    throw err if err
    assert_marker_listed data.stdout
  # Obtain a delegation token, then list again without Kerberos credentials
  @system.execute
    cmd: mkcmd.test options.test_krb5_user, """
    curl -s --negotiate --insecure -u : "#{webhdfs_url}/?op=GETDELEGATIONTOKEN"
    kdestroy
    """
  , (err, data) ->
    throw err if err
    json = JSON.parse data.stdout
    # WebHDFS reports errors nested under `RemoteException`; the flat
    # `exception` key is still honoured for backward compatibility.
    exception = json.RemoteException?.exception ? json.exception
    # The previous code scheduled an undefined `do_tocken` function here, which
    # raised a ReferenceError; fail with an explicit message instead.
    throw Error "NameNode not ready to issue a delegation token (RetriableException), run the check again" if exception is 'RetriableException'
    token = json.Token?.urlString
    throw Error "Invalid result" unless token
    @system.execute
      cmd: """
      curl -s --insecure "#{webhdfs_url}/user/#{options.test.user.name}?delegation=#{token}&op=LISTSTATUS"
      """
    , (err, data) ->
      throw err if err
      assert_marker_listed data.stdout
Dependencies
mkcmd = require '../../lib/mkcmd'