Table of Contents
1 References
- My blog post: Introduction to Kerberos, Its Installation, and Its Integration and Use with Other Services
- A translation of the official Kerberos quick-start guide: A guided tour of Kerberos: Tutorial
- Hadoop and Kerberos: The Madness beyond the Gate
- Hadoop in Secure Mode
2 KDC Installation
System dependencies
yum install -y libss
# Install if the krb5-server installation reports:
#   libverto-module-base is needed by krb5-server-1.15.1-46.el7.x86_64
yum install -y libverto-libevent
yum install -y logrotate
2.1 Install the Kerberos Service
# 1 Check whether Kerberos is already installed in the environment
rpm -qa | grep -E "krb5|libkadm5"
# Be careful when removing packages; to keep versions consistent, an in-place upgrade is best.
# Kerberos and SSH share some libraries, so removing them may break SSH because certain
# library files go missing.
# In a Docker container you can copy the missing files from the host into the container,
# following the error messages, e.g.:
# docker cp /usr/lib64/libss.so.2.0 dev_mysql_v1:/usr/lib64/libss.so.2.0
# rpm -e --nodeps xxx

# 2 Install the packages Kerberos depends on.
# Check whether "words" is present; if not, install it. Some applications and databases
# use it for spell checking, and password checkers use it to detect weak passwords.
rpm -qa | grep words
# Install words
wget http://mirror.centos.org/centos/7/os/x86_64/Packages/words-3.0-22.el7.noarch.rpm
rpm -ivh words-3.0-22.el7.noarch.rpm
# After installation there is a word list at /usr/share/dict/words.

# 3 Download the required packages
# CentOS 7.8.2003 ships krb5-libs-1.15.1-46 by default
wget http://mirror.centos.org/centos/7/os/x86_64/Packages/krb5-libs-1.15.1-46.el7.x86_64.rpm
wget http://mirror.centos.org/centos/7/os/x86_64/Packages/krb5-server-1.15.1-46.el7.x86_64.rpm
wget http://mirror.centos.org/centos/7/os/x86_64/Packages/krb5-workstation-1.15.1-46.el7.x86_64.rpm
wget http://mirror.centos.org/centos/7/os/x86_64/Packages/libkadm5-1.15.1-46.el7.x86_64.rpm

# 4 Install
rpm -iUh krb5-libs-1.15.1-46.el7.x86_64.rpm
rpm -ivh libkadm5-1.15.1-46.el7.x86_64.rpm
rpm -ivh krb5-workstation-1.15.1-46.el7.x86_64.rpm
rpm -ivh krb5-server-1.15.1-46.el7.x86_64.rpm
2.2 Configure /var/kerberos/krb5kdc/kdc.conf
[kdcdefaults]
 kdc_ports = 88
 kdc_tcp_ports = 88

[realms]
 YORE.COM = {
  # JDK 8 before 1.8.0_152 may not support aes256-cts. If you hit
  # java.security.InvalidKeyException: Illegal key size, either:
  # Option 1: drop aes256-cts and keep aes128-cts, or
  # Option 2: download Oracle's jce_policy-8.zip, unzip it, and overwrite
  #   local_policy.jar and US_export_policy.jar under jre\lib\security in the JDK
  #   install directory (the path varies slightly per version; use the files from
  #   the "unlimited" directory).
  # Download:
  #   http://www.oracle.com/technetwork/java/javase/downloads/jce8-download-2133166.html
  #   https://www.oracle.com/java/technologies/javase-jce-all-downloads.html
  #master_key_type = aes256-cts
  acl_file = /var/kerberos/krb5kdc/kadm5.acl
  dict_file = /usr/share/dict/words
  admin_keytab = /var/kerberos/krb5kdc/kadm5.keytab
  max_renewable_life = 7d 0h 0m 0s
  #supported_enctypes = aes256-cts:normal aes128-cts:normal des3-hmac-sha1:normal arcfour-hmac:normal camellia256-cts:normal camellia128-cts:normal des-hmac-sha1:normal des-cbc-md5:normal des-cbc-crc:normal
  supported_enctypes = aes128-cts:normal des3-hmac-sha1:normal arcfour-hmac:normal camellia256-cts:normal camellia128-cts:normal des-hmac-sha1:normal des-cbc-md5:normal des-cbc-crc:normal
 }
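To check which AES key length the local JDK actually allows (and therefore whether the JCE unlimited-strength policy is in place), a quick probe can be run with jrunscript, which ships with JDK 8. This is a sanity check of my own, not part of the original walkthrough:

# Prints 2147483647 when the unlimited-strength policy is installed, 128 otherwise
jrunscript -e 'print(javax.crypto.Cipher.getMaxAllowedKeyLength("AES"))'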
2.3 Configure /var/kerberos/krb5kdc/kadm5.acl
Change the realm to the name configured above, YORE.COM. Any principal whose name matches */admin@YORE.COM is then treated as an admin, and the permission field * grants all permissions.
*/admin@YORE.COM *
2.4 Configure /etc/krb5.conf
Here yore.bigdata03.com is the hostname of the KDC service; the kdc port defaults to 88 when omitted, and the admin_server port defaults to 749.
# Configuration snippets may be placed in this directory as well
includedir /etc/krb5.conf.d/

[logging]
 default = FILE:/var/log/krb5libs.log
 kdc = FILE:/var/log/krb5kdc.log
 admin_server = FILE:/var/log/kadmind.log

[libdefaults]
 default_realm = YORE.COM
 dns_lookup_realm = false
 dns_lookup_kdc = false
 ticket_lifetime = 24h
 renew_lifetime = 7d
 forwardable = true
 #rdns = false
 #pkinit_anchors = /etc/pki/tls/certs/ca-bundle.crt
 # default_realm = EXAMPLE.COM
 #default_ccache_name = KEYRING:persistent:%{uid}
 udp_preference_limit = 1
 kdc_timeout = 3000

[realms]
 YORE.COM = {
  kdc = yore.bigdata03.com:88
  admin_server = yore.bigdata03.com:749
  default_domain = YORE.COM
 }

[domain_realm]
 .yore.bigdata03.com = YORE.COM
 yore.bigdata03.com = YORE.COM
2.5 Create the Kerberos Database
# 1 Create/initialize the Kerberos database.
# If something goes wrong you may need to run:
#   /usr/sbin/kdb5_util -r YORE.COM -m destroy -f
# and delete /var/kerberos/krb5kdc/principal*
#
# You will be prompted for a password (kdc123).
/usr/sbin/kdb5_util create -s -r YORE.COM

# 2 Check the generated files.
# The first two were created in the previous steps; the principal* files were generated now.
[root@kdc download]# ll /var/kerberos/krb5kdc/
total 24
-rw-r--r-- 1 root root   19 Mar 25 21:41 kadm5.acl
-rw-r--r-- 1 root root  488 Mar 25 21:42 kdc.conf
-rw------- 1 root root 8192 Mar 25 21:40 principal
-rw------- 1 root root 8192 Mar 25 21:40 principal.kadm5
-rw------- 1 root root    0 Mar 25 21:40 principal.kadm5.lock
-rw------- 1 root root    0 Mar 25 21:40 principal.ok
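Because the database was created with -s, kdb5_util also writes a stash file holding the master key, so krb5kdc can start without prompting for it. A quick way to confirm it exists (the .k5.YORE.COM name follows the stash-file naming convention for this realm):

# The stash file is hidden; it should be root-owned with restrictive permissions
ls -la /var/kerberos/krb5kdc/ | grep .k5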
2.6 Create the Kerberos Admin Account
# You will be prompted for the admin password (kdc123) and asked to confirm it.
# If no error is reported, the principal was created successfully.
[root@kdc download]# /usr/sbin/kadmin.local -q "addprinc admin/admin"
Authenticating as principal root/admin@YORE.COM with password.
WARNING: no policy specified for admin/admin@YORE.COM; defaulting to no policy
Enter password for principal "admin/admin@YORE.COM":
Re-enter password for principal "admin/admin@YORE.COM":
Principal "admin/admin@YORE.COM" created.
2.7 Enable krb5kdc and kadmin at Boot and Start Them
# Enable at boot
systemctl enable krb5kdc
systemctl enable kadmin
# Start
systemctl start krb5kdc
systemctl start kadmin
# Status
systemctl status krb5kdc
systemctl status kadmin
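As an optional sanity check (assuming iproute2's ss is available), confirm that the KDC is listening on port 88 and kadmind on 464/749:

ss -tlnup | grep -E ':(88|464|749) '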
2.8 Check Authentication
# 1 When prompted, enter the admin password (the Kerberos admin account's password: kdc123)
[root@kdc download]# kinit admin/admin
Password for admin/admin@YORE.COM:

# 2 List all principals
/usr/sbin/kadmin.local -q "listprincs"

# 3 klist
[root@kdc download]# klist
Ticket cache: FILE:/tmp/krb5cc_0
Default principal: admin/admin@YORE.COM

Valid starting       Expires              Service principal
2020-07-04T11:18:24  2020-07-05T11:18:24  krbtgt/YORE.COM@YORE.COM
        renew until 2020-07-11T11:18:24
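Since kdc.conf set max_renewable_life = 7d and krb5.conf set renew_lifetime = 7d, the ticket above is renewable without re-entering the password until the "renew until" timestamp. A minimal check:

# Renew the current TGT and confirm the new expiry
kinit -R
klist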
3 Application Server
"Application server" here refers to a big-data platform or cluster with Kerberos authentication enabled. This section walks through building a Kerberos-secured Apache Hadoop and Apache Hive, so the application server means the nodes where Hadoop and Hive run.
3.1 Install Kerberos
# 1 Copy the packages over
scp -P 30021 -r krb5-libs-1.15.1-46.el7.x86_64.rpm krb5-workstation-1.15.1-46.el7.x86_64.rpm \
  libkadm5-1.15.1-46.el7.x86_64.rpm root@cdh3:/opt/download/

# 2 Install
rpm -iUh krb5-libs-1.15.1-46.el7.x86_64.rpm
rpm -ivh libkadm5-1.15.1-46.el7.x86_64.rpm
rpm -ivh krb5-workstation-1.15.1-46.el7.x86_64.rpm

# 3 Copy /etc/krb5.conf from the KDC to /etc/ on the application server nodes
scp /etc/krb5.conf root@bigdata01:/etc/
3.2 Create Principals and a Keytab for the Application Server Nodes
Note: to keep things simple, all Hadoop services and the Hive services share a single principal and a single keytab file. In a real production environment, each service should have its own principal; the installation and configuration procedure is essentially the same.
# 1 On the KDC node, create a hadoop principal (password: kdc123)
/usr/sbin/kadmin.local -q "addprinc hadoop/bigdata01@YORE.COM"

# 2 Export the keytab file
/usr/sbin/kadmin.local -q "xst -k ./hadoop.keytab hadoop/bigdata01@YORE.COM"

# 3 Send hadoop.keytab to the Hadoop node (defer this until Hadoop is unpacked in the next step)
scp -P 22 hadoop.keytab root@bigdata01:/opt/installed/hadoop-3.0.3/etc/hadoop/

# 4 Inspect the exported keytab file (mind the file permissions)
/usr/bin/klist -ket /opt/installed/hadoop-3.0.3/etc/hadoop/hadoop.keytab

# 5 Create a Hadoop application user, e.g. hdfs. All following operations use this user.
useradd -s /bin/bash hdfs
# Set its password
passwd hdfs

# 6 Obtain a cached ticket on each Hadoop node. Whichever user operates Hadoop
#   is the user that must authenticate.
kinit -kt /opt/installed/hadoop-3.0.3/etc/hadoop/hadoop.keytab hadoop/bigdata01@YORE.COM

# 7 Inspect the ticket
klist
3.3 Install and Deploy Apache Hadoop
3.3.1 Unpack and Configure
# 1 Download
wget http://archive.apache.org/dist/hadoop/common/hadoop-3.0.3/hadoop-3.0.3.tar.gz

# 2 Unpack
tar -zxf hadoop-3.0.3.tar.gz -C /opt/installed
# To save space, the bundled documentation can be deleted if you don't need it
rm -rf /opt/installed/hadoop-3.0.3/share/doc

# 3 SSH
ssh-keygen -t rsa
ssh-copy-id -i ~/.ssh/id_rsa.pub hdfs@bigdata01

# 4 Environment variables (can go in the user's profile: vim ~/.bash_profile)
# set hadoop environment
export HADOOP_HOME=/opt/installed/hadoop-3.0.3
export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin

# 5 Create the required directories
# Create them as root, then change the owner of /u01/hadoop to hdfs
mkdir -p /u01/hadoop/dfs/dn
mkdir -p /u01/hadoop/dfs/nn
mkdir -p /u01/hadoop/dfs/snn
#mkdir -p /u01/hadoop/yarn/container-logs
#mkdir -p /u01/hadoop/yarn/container-executor
#mkdir -p /u01/hadoop/app/tmp/nm-local-dir
chown -R hdfs:hdfs /u01/hadoop

# 6 Configure hadoop-env.sh
vim $HADOOP_HOME/etc/hadoop/hadoop-env.sh
# Add the following:
# export JAVA_HOME=/usr/local/jdk1.8.0_231
export HADOOP_HOME=/opt/installed/hadoop-3.0.3
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib:$HADOOP_COMMON_LIB_NATIVE_DIR"
# User settings: if the services are started as root, these must be set to root
#export HDFS_NAMENODE_USER=root
#export HDFS_DATANODE_USER=root
#export HDFS_SECONDARYNAMENODE_USER=root
#export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root
3.3.2 Configure core-site.xml
vim $HADOOP_HOME/etc/hadoop/core-site.xml
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://bigdata01:8020</value>
  </property>
  <property>
    <name>fs.trash.interval</name>
    <value>60</value>
  </property>
  <property>
    <name>fs.trash.checkpoint.interval</name>
    <value>0</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hdfs.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hdfs.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/tmp/hadoop-${user.name}</value>
  </property>

  <property>
    <name>hadoop.security.authentication</name>
    <value>kerberos</value>
  </property>
  <property>
    <name>hadoop.security.authorization</name>
    <value>true</value>
  </property>
  <property>
    <name>hadoop.rpc.protection</name>
    <value>authentication</value>
  </property>
  <property>
    <name>hadoop.security.auth_to_local</name>
    <value>
      RULE:[2:$1/$2@$0](hadoop/.*@YORE.COM)s/.*/hdfs/
      RULE:[2:$1/$2@$0](hadoop/.*@YORE.COM)s/.*/yarn/
      RULE:[2:$1/$2@$0](hadoop/.*@YORE.COM)s/.*/mapred/
      DEFAULT
    </value>
  </property>
</configuration>
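The auth_to_local rules can be tested offline with Hadoop's HadoopKerberosName helper, which prints the short name a given principal maps to. A sketch, assuming $HADOOP_HOME/bin is on the PATH and the hadoop/<host>@YORE.COM principal form used here:

# Should print the mapped short name (hdfs, from the first matching rule above)
hadoop org.apache.hadoop.security.HadoopKerberosName hadoop/bigdata01@YORE.COM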
3.3.3 Configure hdfs-site.xml
# Note: on CentOS 7, non-root users cannot bind ports below 1024.
# Option 1: use ports above 1024
# Option 2: port redirection
yum install iptables-services
# Inspect the existing iptables rules
iptables -L -n
# Either disable the firewall or open the ports.
# Redirect port x to port xx:
iptables -t nat -A PREROUTING -p tcp --dport x -j REDIRECT --to-port xx
# Save
iptables-save > /etc/sysconfig/iptables

vim $HADOOP_HOME/etc/hadoop/hdfs-site.xml
<configuration>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:///u01/hadoop/dfs/nn</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:///u01/hadoop/dfs/dn</value>
  </property>
  <property>
    <name>dfs.namenode.checkpoint.dir</name>
    <value>file:///u01/hadoop/dfs/snn</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
  <property>
    <name>dfs.permissions</name>
    <value>false</value>
  </property>

  <property>
    <name>dfs.http.policy</name>
    <value>HTTPS_ONLY</value>
  </property>
  <property>
    <name>dfs.namenode.https-address</name>
    <value>bigdata01:9871</value>
  </property>
  <property>
    <name>dfs.namenode.http-address</name>
    <value>bigdata01:50070</value>
  </property>
  <property>
    <name>dfs.block.access.token.enable</name>
    <value>true</value>
  </property>
  <property>
    <name>dfs.namenode.kerberos.principal</name>
    <value>hadoop/_HOST@YORE.COM</value>
  </property>
  <property>
    <name>dfs.namenode.keytab.file</name>
    <value>/opt/installed/hadoop-3.0.3/etc/hadoop/hadoop.keytab</value>
  </property>
  <property>
    <name>dfs.namenode.kerberos.internal.spnego.principal</name>
    <value>hadoop/_HOST@YORE.COM</value>
  </property>

  <property>
    <name>dfs.namenode.secondary.http-address</name>
    <value>bigdata01:9868</value>
  </property>
  <property>
    <name>dfs.secondary.https.address</name>
    <value>bigdata01:9869</value>
  </property>
  <property>
    <name>dfs.secondary.namenode.keytab.file</name>
    <value>/opt/installed/hadoop-3.0.3/etc/hadoop/hadoop.keytab</value>
  </property>
  <property>
    <name>dfs.secondary.namenode.kerberos.principal</name>
    <value>hadoop/_HOST@YORE.COM</value>
  </property>
  <property>
    <name>dfs.secondary.namenode.kerberos.internal.spnego.principal</name>
    <value>hadoop/_HOST@YORE.COM</value>
  </property>

  <property>
    <name>dfs.datanode.data.dir.perm</name>
    <value>700</value>
  </property>
  <property>
    <name>dfs.datanode.address</name>
    <value>0.0.0.0:1104</value>
  </property>
  <property>
    <name>dfs.datanode.http.address</name>
    <value>0.0.0.0:1106</value>
  </property>
  <property>
    <name>dfs.data.transfer.protection</name>
    <value>integrity</value>
  </property>
  <property>
    <name>dfs.datanode.kerberos.principal</name>
    <value>hadoop/_HOST@YORE.COM</value>
  </property>
  <property>
    <name>dfs.datanode.keytab.file</name>
    <value>/opt/installed/hadoop-3.0.3/etc/hadoop/hadoop.keytab</value>
  </property>
  <property>
    <name>dfs.encrypt.data.transfer</name>
    <value>false</value>
  </property>

  <property>
    <name>dfs.web.authentication.kerberos.principal</name>
    <value>hadoop/_HOST@YORE.COM</value>
  </property>
  <property>
    <name>dfs.web.authentication.kerberos.keytab</name>
    <value>/opt/installed/hadoop-3.0.3/etc/hadoop/hadoop.keytab</value>
  </property>
</configuration>
3.3.4 Configure SSL
(1) Generate the required certificate files
mkdir /etc/https
cd /etc/https

# Generate the CA. You will be prompted for a password; enter one longer than 6 characters (bigdata)
openssl req -new -x509 -keyout hdfs_ca_key -out hdfs_ca_cert -days 3650 \
  -subj '/C=CN/ST=beijing/L=chaoyang/O=yore/OU=dt/CN=yore.com'

# On every machine, generate a keystore and a truststore.
# Generate the keystore
keytool -keystore keystore -alias localhost -validity 3650 -genkey -keyalg RSA -keysize 2048 \
  -dname "CN=${fqdn}, OU=DT, O=DT, L=CY, ST=BJ, C=CN"
# Add the CA to the truststore; answer y when asked whether to trust the certificate
keytool -keystore truststore -alias CARoot -import -file hdfs_ca_cert
# Export the cert from the keystore
keytool -certreq -alias localhost -keystore keystore -file cert
# Sign the cert with the CA
openssl x509 -req -CA hdfs_ca_cert -CAkey hdfs_ca_key -in cert -out cert_signed -days 3650 -CAcreateserial
# Import the CA cert and the CA-signed cert into the keystore
keytool -keystore keystore -alias CARoot -import -file hdfs_ca_cert
keytool -keystore keystore -alias localhost -import -file cert_signed
# Make copies with the .jks suffix
cp keystore /etc/https/keystore.jks
cp truststore /etc/https/truststore.jks
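Before wiring the stores into Hadoop, it may be worth verifying that the signed certificate chains back to the CA and that the keystore holds both entries; a couple of optional checks using the files generated above:

# The signed cert should verify against the CA
openssl verify -CAfile hdfs_ca_cert cert_signed
# The keystore should now contain two entries: CARoot and localhost
keytool -list -keystore /etc/https/keystore.jks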
(2) Configure ssl-client.xml
cp $HADOOP_HOME/etc/hadoop/ssl-client.xml.example $HADOOP_HOME/etc/hadoop/ssl-client.xml
vim $HADOOP_HOME/etc/hadoop/ssl-client.xml
<configuration>
  <property>
    <name>ssl.client.truststore.location</name>
    <value>/etc/https/truststore.jks</value>
    <description>Truststore to be used by clients like distcp. Must be specified.</description>
  </property>
  <property>
    <name>ssl.client.truststore.password</name>
    <value>bigdata</value>
    <description>Optional. Default value is "".</description>
  </property>
  <property>
    <name>ssl.client.truststore.type</name>
    <value>jks</value>
    <description>Optional. The keystore file format, default value is "jks".</description>
  </property>
  <property>
    <name>ssl.client.truststore.reload.interval</name>
    <value>10000</value>
    <description>Truststore reload check interval, in milliseconds. Default value is 10000 (10 seconds).</description>
  </property>
  <property>
    <name>ssl.client.keystore.location</name>
    <value>/etc/https/keystore.jks</value>
    <description>Keystore to be used by clients like distcp. Must be specified.</description>
  </property>
  <property>
    <name>ssl.client.keystore.password</name>
    <value>bigdata</value>
    <description>Optional. Default value is "".</description>
  </property>
  <property>
    <name>ssl.client.keystore.keypassword</name>
    <value>bigdata</value>
    <description>Optional. Default value is "".</description>
  </property>
  <property>
    <name>ssl.client.keystore.type</name>
    <value>jks</value>
    <description>Optional. The keystore file format, default value is "jks".</description>
  </property>
</configuration>
(3) Configure ssl-server.xml
cp $HADOOP_HOME/etc/hadoop/ssl-server.xml.example $HADOOP_HOME/etc/hadoop/ssl-server.xml
vim $HADOOP_HOME/etc/hadoop/ssl-server.xml
<configuration>
  <property>
    <name>ssl.server.truststore.location</name>
    <value>/etc/https/truststore.jks</value>
    <description>Truststore to be used by NN and DN. Must be specified.</description>
  </property>
  <property>
    <name>ssl.server.truststore.password</name>
    <value>bigdata</value>
    <description>Optional. Default value is "".</description>
  </property>
  <property>
    <name>ssl.server.truststore.type</name>
    <value>jks</value>
    <description>Optional. The keystore file format, default value is "jks".</description>
  </property>
  <property>
    <name>ssl.server.truststore.reload.interval</name>
    <value>10000</value>
    <description>Truststore reload check interval, in milliseconds. Default value is 10000 (10 seconds).</description>
  </property>
  <property>
    <name>ssl.server.keystore.location</name>
    <value>/etc/https/keystore.jks</value>
    <description>Keystore to be used by NN and DN. Must be specified.</description>
  </property>
  <property>
    <name>ssl.server.keystore.password</name>
    <value>bigdata</value>
    <description>Optional. Default value is "".</description>
  </property>
  <property>
    <name>ssl.server.keystore.keypassword</name>
    <value>bigdata</value>
    <description>Optional. Default value is "".</description>
  </property>
  <property>
    <name>ssl.server.keystore.type</name>
    <value>jks</value>
    <description>Optional. The keystore file format, default value is "jks".</description>
  </property>
</configuration>
3.3.5 Configure mapred-site.xml
vim $HADOOP_HOME/etc/hadoop/mapred-site.xml
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>bigdata01:19888</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.https.address</name>
    <value>bigdata01:19890</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.address</name>
    <value>bigdata01:10020</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.keytab</name>
    <value>/opt/installed/hadoop-3.0.3/etc/hadoop/hadoop.keytab</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.principal</name>
    <value>hadoop/_HOST@YORE.COM</value>
  </property>
</configuration>
3.3.6 Configure yarn-site.xml
vim $HADOOP_HOME/etc/hadoop/yarn-site.xml
<configuration>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>bigdata01</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
  <property>
    <name>yarn.nodemanager.local-dirs</name>
    <value>${hadoop.tmp.dir}/nm-local-dir</value>
  </property>
  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.log.server.url</name>
    <value>http://bigdata01:19888/jobhistory/logs/</value>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.application.classpath</name>
    <value>
      $HADOOP_HOME/etc/hadoop,
      $HADOOP_HOME/share/hadoop/common/*,
      $HADOOP_HOME/share/hadoop/common/lib/*,
      $HADOOP_HOME/share/hadoop/hdfs/*,
      $HADOOP_HOME/share/hadoop/hdfs/lib/*,
      $HADOOP_HOME/share/hadoop/mapreduce/*,
      $HADOOP_HOME/share/hadoop/mapreduce/lib/*,
      $HADOOP_HOME/share/hadoop/yarn/*,
      $HADOOP_HOME/share/hadoop/yarn/lib/*
    </value>
  </property>

  <property>
    <name>yarn.resourcemanager.principal</name>
    <value>hadoop/_HOST@YORE.COM</value>
  </property>
  <property>
    <name>yarn.resourcemanager.keytab</name>
    <value>/opt/installed/hadoop-3.0.3/etc/hadoop/hadoop.keytab</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.https.address</name>
    <value>${yarn.resourcemanager.hostname}:8090</value>
  </property>

  <property>
    <name>yarn.nodemanager.principal</name>
    <value>hadoop/_HOST@YORE.COM</value>
  </property>
  <property>
    <name>yarn.nodemanager.keytab</name>
    <value>/opt/installed/hadoop-3.0.3/etc/hadoop/hadoop.keytab</value>
  </property>
  <property>
    <name>yarn.nodemanager.webapp.https.address</name>
    <value>0.0.0.0:8044</value>
  </property>
</configuration>
3.3.7 Initialize, Start, and Stop Hadoop
# Before starting, edit the worker list: write each cluster node's hostname into the file below.
# Here only bigdata01 is added.
vim $HADOOP_HOME/etc/hadoop/workers

# 1 Initialize
$HADOOP_HOME/bin/hdfs namenode -format

# 2 Start Hadoop
# $HADOOP_HOME/sbin/start-all.sh
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/sbin/start-yarn.sh

# 3 Stop Hadoop
$HADOOP_HOME/sbin/stop-all.sh
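After start-up, a quick jps on bigdata01 should show the expected daemons (this assumes the single-node layout used in this walkthrough):

jps
# Expected, roughly:
#   NameNode, DataNode, SecondaryNameNode   (from start-dfs.sh)
#   ResourceManager, NodeManager            (from start-yarn.sh)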
3.3.8 Tests and Common Commands
# 1 Test that the cluster works
hadoop fs -mkdir -p /tmp/input
hadoop fs -put $HADOOP_HOME/README.txt /tmp/input
export hadoop_version=`hadoop version | head -n 1 | awk '{print $2}'`
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-$hadoop_version.jar \
  wordcount /tmp/input /tmp/output
hadoop fs -tail /tmp/output/part-r-00000

# 2 Show the HDFS name
hdfs getconf -confKey fs.default.name
# HDFS summary
hdfs dfsadmin -report
3.4 Install and Deploy Apache Hive
3.4.1 Before Configuring
# 1 Download
# Note: since our Hadoop is 3.0.x, be sure to download a Hive build that supports Hadoop 3.0;
# Hive 2.1.1, for example, does not support 3.0.x.
# wget http://archive.apache.org/dist/hive/hive-2.1.1/apache-hive-2.1.1-bin.tar.gz
wget http://archive.apache.org/dist/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz
tar -zxf apache-hive-3.1.2-bin.tar.gz -C /opt/installed/
cd /opt/installed/apache-hive-3.1.2-bin

# 2 Environment variables
# set hive environment
export HIVE_HOME=/opt/installed/apache-hive-3.1.2-bin
export PATH=$PATH:$HIVE_HOME/bin

# 3 Create the metastore database, here using MySQL
mysql> create database metastore1;

# 4 Put the MySQL driver under $HIVE_HOME/lib
wget https://repo1.maven.org/maven2/mysql/mysql-connector-java/5.1.47/mysql-connector-java-5.1.47.jar -P $HIVE_HOME/lib/

# 5 Copy the Hive config files from their templates
cd $HIVE_HOME/conf
cp hive-env.sh.template hive-env.sh
cp hive-default.xml.template hive-site.xml
cp hive-log4j2.properties.template hive-log4j2.properties
cp hive-exec-log4j2.properties.template hive-exec-log4j2.properties
cp beeline-log4j2.properties.template beeline-log4j2.properties

# 6 Add a Hive principal and keytab.
# Since this is a test environment, we keep using the hadoop keytab for simplicity.
# The node already holds a ticket, so the kinit step is skipped here.

# 7 Create the warehouse directory in HDFS
hadoop fs -mkdir -p /user/hive/warehouse
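The steps above connect to MySQL as root. If you prefer a dedicated metastore account, here is a sketch (the hive user and hive123 password are hypothetical; adjust the javax.jdo.option.Connection* settings in hive-site.xml to match):

mysql> CREATE USER 'hive'@'%' IDENTIFIED BY 'hive123';
mysql> GRANT ALL PRIVILEGES ON metastore1.* TO 'hive'@'%';
mysql> FLUSH PRIVILEGES;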
3.4.2 Edit hive-env.sh
export JAVA_HOME=/usr/local/jdk1.8.0_231
export HADOOP_HOME=/opt/installed/hadoop-3.0.3
export HIVE_HOME=/opt/installed/apache-hive-3.1.2-bin
export HIVE_CONF_DIR=$HIVE_HOME/conf
3.4.3 Edit hive-site.xml
Mainly change the following settings; the rest can stay at their defaults. Also watch out for an invalid character in the copied template (best to remove the stray character around line 3215; it is not valid XML and breaks parsing).
<configuration>
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/user/hive/warehouse</value>
    <description>Where Hive stores its data on HDFS; other paths may be used. If no location is specified at table-creation time, data is stored here by default.</description>
  </property>

  <property>
    <name>hive.server2.thrift.bind.host</name>
    <value>bigdata01</value>
    <description>Bind host on which to run the HiveServer2 Thrift service.</description>
  </property>
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://bigdata01:9083</value>
    <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
  </property>

  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://bigdata01:3306/metastore1?createDatabaseIfNotExist=true&amp;useUnicode=true&amp;characterEncoding=utf8&amp;useSSL=false</value>
    <description>JDBC connect string for a JDBC metastore. To use SSL to encrypt/authenticate the connection, provide database-specific SSL flag in the connection URL. For example, jdbc:postgresql://myhost/db?ssl=true for postgres database.</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
    <description>Driver class name for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
    <description>Username to use against metastore database</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value></value>
    <description>password to use against metastore database</description>
  </property>

  <property>
    <name>datanucleus.schema.autoCreateAll</name>
    <value>true</value>
    <description>Recommended off in production; enable it in test environments or during initial setup, and set it back to false once things are stable.</description>
  </property>
  <property>
    <name>hive.metastore.schema.verification</name>
    <value>false</value>
    <description>
      Enforce metastore schema version consistency.
      True: Verify that version information stored in metastore is compatible with one from Hive jars. Also disable automatic schema migration attempt. Users are required to manually migrate schema after Hive upgrade which ensures proper metastore schema migration. (Default)
      False: Warn if the version information stored in metastore doesn't match with one from in Hive jars.
    </description>
  </property>

  <property>
    <name>hive.exec.local.scratchdir</name>
    <value>/tmp/hive/exec/${user.name}</value>
    <description>Local scratch space for Hive jobs</description>
  </property>
  <property>
    <name>hive.downloaded.resources.dir</name>
    <value>/tmp/hive/${hive.session.id}_resources</value>
    <description>Temporary local directory for added resources in the remote file system.</description>
  </property>
  <property>
    <name>hive.querylog.location</name>
    <value>/tmp/hive/${user.name}</value>
    <description>Location of Hive run time structured log file</description>
  </property>
  <property>
    <name>hive.server2.logging.operation.log.location</name>
    <value>/tmp/hive/server2/${user.name}/operation_logs</value>
    <description>Top level directory where operation logs are stored if logging functionality is enabled</description>
  </property>

  <property>
    <name>hive.exec.dynamic.partition.mode</name>
    <value>nonstrict</value>
    <description>In strict mode, the user must specify at least one static partition in case the user accidentally overwrites all partitions. In nonstrict mode all partitions are allowed to be dynamic.</description>
  </property>

  <property>
    <name>hive.server2.authentication</name>
    <value>KERBEROS</value>
    <description>
      Expects one of [nosasl, none, ldap, kerberos, pam, custom].
      Client authentication types.
        NONE: no authentication check
        LDAP: LDAP/AD based authentication
        KERBEROS: Kerberos/GSSAPI authentication
        CUSTOM: Custom authentication provider (Use with property hive.server2.custom.authentication.class)
        PAM: Pluggable authentication module
        NOSASL: Raw transport
    </description>
  </property>
  <property>
    <name>hive.server2.authentication.kerberos.principal</name>
    <value>hadoop/_HOST@YORE.COM</value>
    <description>Kerberos server principal</description>
  </property>
  <property>
    <name>hive.server2.authentication.kerberos.keytab</name>
    <value>/opt/installed/hadoop-3.0.3/etc/hadoop/hadoop.keytab</value>
    <description>Kerberos keytab file for server principal</description>
  </property>
  <property>
    <name>hive.metastore.sasl.enabled</name>
    <value>true</value>
    <description>If true, the metastore Thrift interface will be secured with SASL. Clients must authenticate with Kerberos.</description>
  </property>
  <property>
    <name>hive.metastore.kerberos.keytab.file</name>
    <value>/opt/installed/hadoop-3.0.3/etc/hadoop/hadoop.keytab</value>
    <description>The path to the Kerberos Keytab file containing the metastore Thrift server's service principal.</description>
  </property>
  <property>
    <name>hive.metastore.kerberos.principal</name>
    <value>hadoop/_HOST@YORE.COM</value>
    <description>The service principal for the metastore Thrift server. The special string _HOST will be replaced automatically with the correct host name.</description>
  </property>
</configuration>
3.4.4 Initialize and Start Hive
# 1 Initialize the Hive metadata.
# This creates the metadata tables in MySQL (run as the hdfs user).
# On success, the metadata tables appear in the MySQL metastore1 database.
$HIVE_HOME/bin/schematool -dbType mysql -initSchema

# 2 Start the services
hive --service metastore >/dev/null 2>&1 &
hive --service hiveserver2 >/dev/null 2>&1 &

# 3 Connect to Hive with beeline
beeline --color=true -d "org.apache.hive.jdbc.HiveDriver" \
  -u "jdbc:hive2://bigdata01:10000/default;principal=hadoop/bigdata01@YORE.COM"
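If beeline cannot connect, first confirm both services are actually listening (the metastore on 9083, HiveServer2 on 10000):

ss -tlnp | grep -E ':(9083|10000) '
# If anything failed at startup, the log lands by default at /tmp/<user>/hive.log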
3.4.5 Garbled Chinese Comments
If Chinese comments come out garbled, connect to MySQL and run:
use metastore1;
-- Inspect COLUMNS_V2; its CREATE TABLE statement shows ENGINE=InnoDB DEFAULT CHARSET=latin1,
-- i.e. it uses the latin1 character set. Note: the table name may differ between versions;
-- look for tables whose names start with COLUMNS first.
select * from COLUMNS_V2;
show create table COLUMNS_V2;

alter table COLUMNS_V2 modify column COMMENT varchar(256) character set utf8;
alter table TABLE_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8;
alter table PARTITION_KEYS modify column PKEY_COMMENT varchar(4000) character set utf8;
alter table DBS modify column `DESC` varchar(4000) character set utf8;
3.4.6 A Small Example
Log in to Hive with Beeline (for more advanced Beeline usage, see my other blog post, Advanced Beeline Usage), create a table, insert data, and run a simple SQL query to verify that Hive works.
-- 1 Create a table
0: jdbc:hive2://bigdata02:10000/default> create table if not exists person (id int, name varchar(32) comment '姓名', age int);

-- 2 Show the table DDL. Note that the Chinese comment is not garbled
0: jdbc:hive2://bigdata02:10000/default> show create table person;
+----------------------------------------------------+
|                   createtab_stmt                   |
+----------------------------------------------------+
| CREATE TABLE `person`(                             |
|   `id` int,                                        |
|   `name` varchar(32) COMMENT '姓名',               |
|   `age` int)                                       |
| ROW FORMAT SERDE                                   |
|   'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' |
| STORED AS INPUTFORMAT                              |
|   'org.apache.hadoop.mapred.TextInputFormat'       |
| OUTPUTFORMAT                                       |
|   'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' |
| LOCATION                                           |
|   'hdfs://bigdata02:8020/user/hive/warehouse/person' |
| TBLPROPERTIES (                                    |
|   'bucketing_version'='2',                         |
|   'transient_lastDdlTime'='')                      |
+----------------------------------------------------+
15 rows selected (0.145 seconds)

-- 3 Insert data
0: jdbc:hive2://bigdata02:10000/default> insert into person values(100, "小兰", 18),(102, "yore", 20);

-- 4 Query the table
0: jdbc:hive2://bigdata02:10000/default> SELECT * FROM person;
+------------+--------------+-------------+
| person.id  | person.name  | person.age  |
+------------+--------------+-------------+
| 100        | 小兰          | 18          |
| 102        | yore         | 20          |
+------------+--------------+-------------+
2 rows selected (0.236 seconds)
4 Client
4.1 Kerberos Client on Windows
Note: Windows and Linux use different file path separators; in the configuration files here, always use / as the path separator.
# 1 Download: visit http://web.mit.edu/kerberos/dist/
wget http://web.mit.edu/kerberos/dist/kfw/4.1/kfw-4.1-amd64.msi

# 2 Double-click the downloaded kfw-4.1-amd64.msi to install it
# 3 You will be prompted to reboot when the installation finishes; reboot the system.
#   The tools are installed under C:\Program Files\MIT\Kerberos\bin

# 4 krb5.ini
# Download /etc/krb5.conf from the KDC or an application server node to Windows,
# rename it to krb5.ini, and place it in C:\ProgramData\MIT\Kerberos5.
# Note: the krb5.ini location is not the install location; don't mix them up.
# Then add the following user environment variables.
# Note: C:\temp must exist; the krb5cache file under it is a cache created
# automatically once authentication succeeds.
Variable name (N):  KRB5_CONFIG
Variable value (V): C:\ProgramData\MIT\Kerberos5\krb5.ini
Variable name (N):  KRB5CCNAME
Variable value (V): C:\temp\krb5cache

# 5 Configure hosts
# Add the IP entries from the server-side /etc/hosts to C:\Windows\System32\drivers\etc\hosts,
# mainly the application server IPs and the IPs configured for the KDC.

# 6 PATH order
# The JDK also ships klist and kinit, so if a JDK is already on the PATH,
# C:\Program Files\MIT\Kerberos\bin must be moved ahead of it.
# Case 1: if the Windows/JDK klist wins, `klist` prints: Current LogonId is 0:0x80ea4 ...
# Case 2: if the Kerberos klist wins, `klist` prints: klist: No credentials cache file found

# 7 Copy the generated testuser.keytab to the client machine,
#   e.g. to C:\ProgramData\MIT\Kerberos5\testuser.keytab

# 8 Inspect the keytab file (mind the file permissions)
# Run inside C:\Program Files\MIT\Kerberos, or anywhere if the PATH is set
.\bin\klist.exe -ket C:\ProgramData\MIT\Kerberos5\testuser.keytab

# 9 Obtain a ticket (use the exact principal shown in step 8's klist -ket output)
.\bin\kinit.exe -kt C:\ProgramData\MIT\Kerberos5\testuser.keytab testuser/<instance>@YORE.COM

# 10 Show the locally cached ticket
klist
# The GUI client also works: C:\Program Files\MIT\Kerberos\bin\MIT Kerberos.exe
4.2 Kerberos Client on Linux
Installation on Linux is similar to the application server node installation.
# 1 Check whether the Kerberos client environment is already installed
rpm -qa | grep -E "krb5|libkadm5"

# 2 Download the required Kerberos client packages
wget http://mirror.centos.org/centos/7/os/x86_64/Packages/krb5-libs-1.15.1-46.el7.x86_64.rpm
wget http://mirror.centos.org/centos/7/os/x86_64/Packages/krb5-workstation-1.15.1-46.el7.x86_64.rpm
wget http://mirror.centos.org/centos/7/os/x86_64/Packages/libkadm5-1.15.1-46.el7.x86_64.rpm

# 3 Install
rpm -iUh krb5-libs-1.15.1-46.el7.x86_64.rpm
rpm -ivh libkadm5-1.15.1-46.el7.x86_64.rpm
rpm -ivh krb5-workstation-1.15.1-46.el7.x86_64.rpm

# 4 Get /etc/krb5.conf
# Copy /etc/krb5.conf from the KDC or an application server to /etc on the client machine

# 5 Configure hosts as described in the Environment section: map the cluster IPs to
#   hostnames in /etc/hosts, minding the difference between internal and external IPs.

# 6 Inspect the keytab file (mind the file permissions)
/usr/bin/klist -ket ~/testuser.keytab

# 7 Obtain a ticket (use the exact principal shown by klist -ket)
kinit -kt ~/testuser.keytab testuser/<instance>@YORE.COM

# 8 Show the cached ticket
klist
4.3 DBeaver
The following uses a Windows environment as the example.
4.3.1 Download and Unpack
Download from https://dbeaver.io/download/, e.g. the portable Windows 64 bit (zip) build.
wget https://dbeaver.io/files/dbeaver-ce-latest-win32.win32.x86_64.zip
# Portable build: just unzip and use
unzip dbeaver-ce-latest-win32.win32.x86_64.zip
4.3.2 Edit dbeaver.ini
# Add the following parameters to dbeaver/dbeaver.ini
-Djavax.security.auth.useSubjectCredsOnly=false
-Djava.security.krb5.conf=C:\ProgramData\MIT\Kerberos5\krb5.ini
# Enable krb5 debug logging; the output goes to the dbeaver-debug.log file in the
# dbeaver directory under the user's home directory
-Dsun.security.krb5.debug=true
#-Djava.security.auth.login.config=C:\ProgramData\MIT\Kerberos5\jaas.conf
4.3.3 Getting the Hive Driver – Export the Hive JDBC Driver from a Maven Project
Prerequisite: JDK and Maven are installed. Create a new directory as the project root and add a pom.xml with roughly the following content. The project's GAV coordinates can be whatever you like; what matters is declaring the hive-jdbc dependency for the version you need, e.g. Hive JDBC 3.1.2.
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.yore</groupId>
    <artifactId>dbeaver-hive-driver</artifactId>
    <version>1.0.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <dependencies>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>3.1.2</version>
        </dependency>
    </dependencies>
</project>
Then run the following command to copy the hive-jdbc dependencies into a directory:
mvn dependency:copy-dependencies -DoutputDirectory=./output
# Pack it up for easier transfer
tar -zcf dbeaver-hive-driver.tar.gz ./output
Open DBeaver, create a new Hive connection, edit the driver, and set mainly the following:
- URL template: jdbc:hive2://{host}[:{port}][/{database}];principal=hadoop/{host}@YORE.COM
- Add folder: the folder exported in the previous step
- Find class: org.apache.hive.jdbc.HiveDriver

4.3.4 Getting the Hive Driver – Download the Uber Hive Driver
Alternatively, download the driver DBeaver recommends by default, hive-jdbc-uber-2.6.5.0-292.jar, load it, and create the Hive connection. A successful connection looks like the following (note principal=hadoop/{host}@YORE.COM: hadoop is the service principal configured in hive-site.xml and cannot be replaced with anything else).
- JDBC URL: jdbc:hive2://bigdata01:10000/default;principal=hadoop/bigdata01@YORE.COM
- Host: bigdata01
- Port: 10000
- Database/schema: default

4.3.5 Clock skew too great(37) – PROCESS_TGS
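This error means the client machine's clock has drifted further from the KDC's clock than the allowed skew (5 minutes by default), so ticket requests are rejected. Syncing the client clock resolves it; a sketch, assuming some NTP source is reachable (pool.ntp.org here is just an example; on Windows, use the date/time sync settings or w32tm /resync):

# Sync the clock against an NTP source, then request the ticket again
ntpdate pool.ntp.org
kinit -kt ~/testuser.keytab testuser/<instance>@YORE.COM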
4.4 Kettle
4.4.1 Version Notes
The walkthrough below uses Kettle 6.1 (Pentaho 6.1); it also works with Kettle 7.1 and 8.2, except that those versions need a copy of the hive-jdbc driver jar under data-integration/lib, otherwise the following error is raised:
org.pentaho.di.core.exception.KettleDatabaseException:
Error occurred while trying to connect to the database
Error connecting to database: (using class org.apache.hive.jdbc.HiveDriver)
Illegal Hadoop Version: Unknown (expected A.B.* format)
For other problems, consult the official documentation in detail (the latest version is 9.0; pick the matching version for older releases):
- Documentation
- Set_Up_Kerberos_for_Pentaho
- Advanced_settings_for_connecting_to_a_Cloudera_cluster
- JDBC_drivers_reference
4.4.2 Download and Big Data Plugin Configuration
# 1 Download (visit one of the links below and pick a version)
# https://sourceforge.net/projects/pentaho/files/Data%20Integration/
# or, recommended, http://mirror.bit.edu.cn/pentaho/

# 2 Download version 6.1
# Visit http://mirror.bit.edu.cn/pentaho/Data%20Integration/6.1/ and download the 6.1 archive
wget http://mirror.bit.edu.cn/pentaho/Data%20Integration/6.1/pdi-ce-6.1.0.1-196.zip

# 3 Unpack (extracts to a data-integration directory)
unzip pdi-ce-6.1.0.1-196.zip

# 4 pentaho-big-data-plugin
# Big-data support lives in data-integration/plugins/pentaho-big-data-plugin
# 4.1 Configure the Hadoop distribution in
#     data-integration/plugins/pentaho-big-data-plugin/plugin.properties.
#     Its hadoop-configurations directory offers: cdh55, emr310, hdp23, mapr410.
#     Taking cdh55 as the example, set the two key properties below in plugin.properties;
#     otherwise, e.g., connecting to Hive will not find a suitable Driver:
active.hadoop.configuration=cdh55
hadoop.configurations.path=hadoop-configurations
# 4.2 Accessing Hadoop first requires core-site.xml:
#     copy $HADOOP_HOME/etc/hadoop/core-site.xml from the production environment into
#     data-integration/plugins/pentaho-big-data-plugin/hadoop-configurations/cdh55
#     (mind the encoding; keep it UTF-8)

# 5 Start
# Before starting, modify the startup script for your OS as described in the next section, then:
# 5.1 Windows: double-click data-integration\Spoon.bat
# 5.2 Linux:   sh data-integration/spoon.sh
4.4.3 Edit the Kettle Startup Scripts
Note: the principal used to access Hive on the application servers is always hadoop/bigdata01@YORE.COM, not the user's own principal; the user's own principal is only used to obtain tickets on the client machine.
# 1 Create a file kettle.login with the following content and save it.
# e.g. Windows: C:/ProgramData/MIT/Kerberos5/kettle.login
#      Linux:   /etc/kettle.login
com.sun.security.jgss.initiate {
  com.sun.security.auth.module.Krb5LoginModule required
  useKeyTab=true
  useTicketCache=false
  keyTab="C:/ProgramData/MIT/Kerberos5/testuser.keytab"
  principal="hadoop/bigdata01@YORE.COM"
  doNotPrompt=true
  debug=true
  debugNative=true;
};

# 2 Edit the Kettle startup script
# 2.1 Windows: data-integration\Spoon.bat
# Around line 96, change the OPT variable as shown below.
# Four parameters are added (each quoted, separated by spaces; to keep the startup
# output visible, add `pause` as the last line of the script):
#   "-Djava.security.auth.login.config=C:/ProgramData/MIT/Kerberos5/kettle.login"
#   "-Djava.security.krb5.realm=YORE.COM"
#   "-Djava.security.krb5.kdc=192.168.33.9"
#   "-Djavax.security.auth.useSubjectCredsOnly=false"
set OPT=%OPT% %PENTAHO_DI_JAVA_OPTIONS% "-Dhttps.protocols=TLSv1,TLSv1.1,TLSv1.2" "-Djava.library.path=%LIBSPATH%" "-Djava.security.auth.login.config=C:/ProgramData/MIT/Kerberos5/kettle.login" "-Djava.security.krb5.realm=YORE.COM" "-Djava.security.krb5.kdc=192.168.33.9" "-Djavax.security.auth.useSubjectCredsOnly=false" "-DKETTLE_HOME=%KETTLE_HOME%" "-DKETTLE_REPOSITORY=%KETTLE_REPOSITORY%" "-DKETTLE_USER=%KETTLE_USER%" "-DKETTLE_PASSWORD=%KETTLE_PASSWORD%" "-DKETTLE_PLUGIN_PACKAGES=%KETTLE_PLUGIN_PACKAGES%" "-DKETTLE_LOG_SIZE_LIMIT=%KETTLE_LOG_SIZE_LIMIT%" "-DKETTLE_JNDI_ROOT=%KETTLE_JNDI_ROOT%"

# 2.2 Linux: data-integration/spoon.sh
# Around line 201, add the same four parameters:
OPT="$OPT $PENTAHO_DI_JAVA_OPTIONS -Dhttps.protocols=TLSv1,TLSv1.1,TLSv1.2 -Djava.library.path=$LIBPATH -Djava.security.auth.login.config=/etc/kettle.login -Djava.security.krb5.realm=YORE.COM -Djava.security.krb5.kdc=192.168.33.9 -Djavax.security.auth.useSubjectCredsOnly=false -DKETTLE_HOME=$KETTLE_HOME -DKETTLE_REPOSITORY=$KETTLE_REPOSITORY -DKETTLE_USER=$KETTLE_USER -DKETTLE_PASSWORD=$KETTLE_PASSWORD -DKETTLE_PLUGIN_PACKAGES=$KETTLE_PLUGIN_PACKAGES -DKETTLE_LOG_SIZE_LIMIT=$KETTLE_LOG_SIZE_LIMIT -DKETTLE_JNDI_ROOT=$KETTLE_JNDI_ROOT"
4.4.4 Connect to Hive
- Connection name: by_yourself
- Connection type: Hadoop Hive 2
- Host name: the hiveserver2 address, e.g. bigdata01 or 192.168.33.3
- Database name: default;principal=hadoop/bigdata01@YORE.COM
- Port: 10000
- Username: (leave blank)
- Password: (leave blank)
