Hive 2.0 Installation and Configuration



1. Installing Hive
System environment
With the Hadoop environment in place, we can install Hive on the NameNode machine (c1).
For setting up Hadoop, see the Hadoop environment setup guide.


Download: apache-hive-2.0.0-bin.tar.gz
Extract to: /home/hadoop/hadoop/hive


Hive configuration


~ cd /home/hadoop/hadoop/hive/conf
~ cp hive-default.xml.template hive-site.xml
~ cp hive-log4j2.properties.template hive-log4j2.properties


Edit the hive-site.xml configuration file
Store Hive's metadata in MySQL


~ vi conf/hive-site.xml


<configuration>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>hdfs://v001:9000/usr/hive/warehouse</value>
<description>Warehouse data path on HDFS</description>
</property>
<property>
<name>hive.exec.scratchdir</name>
<value>hdfs://v001:9000/usr/hive/tmp</value>
<description>Scratch directory for temporary job data</description>
</property>
<property>
<name>hive.querylog.location</name>
<value>/usr/hive/log</value>
<description>Query log location</description>
</property>
<property>
<name>datanucleus.autoCreateSchema</name>
<value>true</value>
</property>
<property>
<name>datanucleus.autoCreateTables</name>
<value>true</value>
</property>
<property>
<name>datanucleus.autoCreateColumns</name>
<value>true</value>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://v002:3306/hive?createDatabaseIfNotExist=true</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>JDBC driver class name</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
<description>Metastore database user name</description>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>hive</value>
<description>Metastore database password</description>
</property>
<property>
<name>hive.metastore.uris</name>
<value>thrift://v001:9083</value>
<description>Host and port of the Hive metastore service</description>
</property>
</configuration>
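
With the datanucleus.autoCreate* options above, Hive will try to create the metastore tables in MySQL on first use. In Hive 2.x you can also initialize the metastore schema explicitly with the schematool utility that ships with Hive; a minimal sketch, assuming the MySQL connection settings above:

# Initialize the metastore schema in MySQL (run once, before starting Hive)
~ bin/schematool -dbType mysql -initSchema
# Check the resulting schema version
~ bin/schematool -dbType mysql -info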




Set environment variables


~ sudo vi /etc/environment

PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/home/hadoop/hive/toolkit/ant184/bin:/home/hadoop/hive/toolkit/jdk16/bin:/home/hadoop/hive/toolkit/maven3/bin:/home/hadoop/hive/toolkit/hadoop-1.0.3/bin:/home/hadoop/hive/toolkit/hive-0.9.0/bin"


export JAVA_HOME=/opt/jdk1.7.0_55
export HADOOP_HOME=/home/hadoop/hadoop
export HIVE_HOME=/home/hadoop/hive
export CLASSPATH=$CLASSPATH:$HIVE_HOME/lib
export PATH=$JAVA_HOME/bin:$HADOOP_HOME/bin:$PATH:$HIVE_HOME/bin
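
After reloading the shell environment, it is worth a quick check that the variables took effect (expected values per the exports above):

~ echo $HIVE_HOME
/home/hadoop/hive
~ hive --version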


Create the directories on HDFS

$HADOOP_HOME/bin/hadoop fs -mkdir -p /tmp
$HADOOP_HOME/bin/hadoop fs -mkdir -p /usr/hive/warehouse
$HADOOP_HOME/bin/hadoop fs -mkdir -p /usr/hive/log
$HADOOP_HOME/bin/hadoop fs -mkdir -p /usr/hive/tmp
$HADOOP_HOME/bin/hadoop fs -chmod g+w /tmp
$HADOOP_HOME/bin/hadoop fs -chmod g+w /usr/hive/warehouse
$HADOOP_HOME/bin/hadoop fs -chmod g+w /usr/hive/log
$HADOOP_HOME/bin/hadoop fs -chmod g+w /usr/hive/tmp
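
A listing confirms the directories were created with group write permission:

$HADOOP_HOME/bin/hadoop fs -ls /usr/hive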


Create the database in MySQL

create database hive;
grant all on hive.* to hive@'%' identified by 'hive';
grant all on hive.* to hive@localhost identified by 'hive';
ALTER DATABASE hive CHARACTER SET latin1;
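
Depending on the MySQL version, you may also need to flush the privilege tables, and it is worth confirming that the hive user can log in from the Hive host before going further:

mysql> FLUSH PRIVILEGES;

~ mysql -h v002 -u hive -p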


Manually copy the MySQL JDBC driver jar into hive/lib


~ ls /home/hadoop/hive/lib
mysql-connector-java-5.1.17.jar


Start Hive

# Start the metastore service
~ bin/hive --service metastore &
Starting Hive Metastore Server


ps ax|grep metastore

# Start the HiveServer2 service
~ bin/hive --service hiveserver2 &
Starting Hive Thrift Server

ps ax|grep HiveServer2
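
Once HiveServer2 is running, JDBC clients can connect to it. The beeline client that ships with Hive is a convenient way to test, assuming HiveServer2 is listening on its default port 10000:

~ bin/beeline -u jdbc:hive2://v001:10000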


# Start the Hive client
~ bin/hive shell
Logging initialized using configuration in file:/root/hive-0.9.0/conf/hive-log4j.properties
Hive history file=/tmp/root/hive_job_log_root_201211141845_1864939641.txt


hive> show tables;
OK
Query the metadata stored in MySQL


~ mysql -h v002 -u hive -p
mysql> use hive;
Database changed


mysql> show tables;
+---------------------------+
| Tables_in_hive            |
+---------------------------+
| bucketing_cols            |
| cds                       |
| columns_v2                |
| database_params           |
| dbs                       |
| func_ru                   |
| funcs                     |
| global_privs              |
| part_col_stats            |
| partition_key_vals        |
| partition_keys            |
| partition_params          |
| partitions                |
| roles                     |
| sd_params                 |
| sds                       |
| sequence_table            |
| serde_params              |
| serdes                    |
| skewed_col_names          |
| skewed_col_value_loc_map  |
| skewed_string_list        |
| skewed_string_list_values |
| skewed_values             |
| sort_cols                 |
| tab_col_stats             |
| table_params              |
| tbl_privs                 |
| tbls                      |
| version                   |
+---------------------------+
30 rows in set (0.00 sec)
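
These are the metastore's internal tables. For instance, tbls records one row per Hive table, so once some tables exist you can cross-check them from MySQL (column names per the standard metastore schema):

mysql> select TBL_NAME, TBL_TYPE from tbls;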


Hive is now installed successfully; what follows is a guide to using it.


2. Basic Usage of Hive
1. Enter the Hive console




~ cd /home/hadoop/hive


~ bin/hive shell
Logging initialized using configuration in file:/home/hadoop/hive/toolkit/hive-0.9.0/conf/hive-log4j.properties
Hive history file=/tmp/cos/hive_job_log_cos_201307160003_95040367.txt
hive>
Create a table


# Create the data file (text, tab-separated)
~ vi /home/hadoop/hive/demo/t_hive.txt

16      2       3
61      12      13
41      2       31
17      21      3
71      2       31
1       12      34
11      2       34


# Create a new table
hive> CREATE TABLE t_hive (a int, b int, c int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
OK
Time taken: 0.489 seconds


# Load t_hive.txt into table t_hive
hive> LOAD DATA LOCAL INPATH '/home/hadoop/hive/demo/t_hive.txt' OVERWRITE INTO TABLE t_hive ;
Copying data from file:/home/hadoop/hive/demo/t_hive.txt
Copying file: file:/home/hadoop/hive/demo/t_hive.txt
Loading data to table default.t_hive
Deleted hdfs://c1.wtmart.com:9000/user/hive/warehouse/t_hive
OK
Time taken: 0.397 seconds
View tables and data


# List tables
hive> show tables;
OK
t_hive
Time taken: 0.099 seconds


# Match table names against a pattern
hive> show tables '*t*';
OK
t_hive
Time taken: 0.065 seconds


# View the table data
hive> select * from t_hive;
OK
16      2       3
61      12      13
41      2       31
17      21      3
71      2       31
1       12      34
11      2       34
Time taken: 0.264 seconds


# View the table schema
hive> desc t_hive;
OK
a       int
b       int
c       int
Time taken: 0.1 seconds
Modify a table


# Add a column
hive> ALTER TABLE t_hive ADD COLUMNS (new_col String);
OK
Time taken: 0.186 seconds
hive> desc t_hive;
OK
a       int
b       int
c       int
new_col string
Time taken: 0.086 seconds


# Rename the table
hive> ALTER TABLE t_hive RENAME TO t_hadoop;
OK
Time taken: 0.45 seconds
hive> show tables;
OK
t_hadoop
Time taken: 0.07 seconds
Drop a table


hive> DROP TABLE t_hadoop;
OK
Time taken: 0.767 seconds


hive> show tables;
OK
Time taken: 0.064 seconds
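
Note that dropping a managed table like this also deletes its data under the warehouse directory. When the data should outlive the table definition, an EXTERNAL table is the usual choice; a sketch, with an illustrative LOCATION path:

hive> CREATE EXTERNAL TABLE t_hive_ext (a int, b int, c int)
    > ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    > LOCATION '/user/hive/external/t_hive_ext';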


3. Hive Interactive Mode
The CLI supports the following commands (a short example session follows this list):
quit, exit: leave the interactive shell
reset: reset the configuration to the default values
set <key>=<value>: set the value of a particular configuration variable (note: if the variable name is misspelled, no error is reported)
set: print the configuration variables overridden by the user
set -v: print all Hadoop and Hive configuration variables
add FILE[S] *, add JAR[S] *, add ARCHIVE[S] *: add one or more files, jars, or archives to the distributed cache
list FILE[S], list JAR[S], list ARCHIVE[S]: list the resources already added to the distributed cache
list FILE[S] *, list JAR[S] *, list ARCHIVE[S] *: check whether the given resources have been added to the distributed cache
delete FILE[S] *, delete JAR[S] *, delete ARCHIVE[S] *: remove the given resources from the distributed cache
! <command>: execute a shell command from the Hive shell
dfs <dfs command>: execute a dfs command from the Hive shell
<query string>: execute a Hive query and print the results to standard output
source FILE <filepath>: execute a Hive script file inside the CLI
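
For example, a brief session exercising a few of these commands (the property and paths are just illustrative):

hive> set hive.cli.print.header=true;
hive> set hive.cli.print.header;
hive.cli.print.header=true
hive> ! date;
hive> dfs -ls /user/hive/warehouse;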


4. Data Import
We again use the t_hive table from above as the example.


# Create the table structure
hive> CREATE TABLE t_hive (a int, b int, c int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
Load data from the local filesystem (LOCAL)


hive> LOAD DATA LOCAL INPATH '/home/hadoop/hive/demo/t_hive.txt' OVERWRITE INTO TABLE t_hive ;
Copying data from file:/home/hadoop/hive/demo/t_hive.txt
Copying file: file:/home/hadoop/hive/demo/t_hive.txt
Loading data to table default.t_hive
Deleted hdfs://c1.wtmart.com:9000/user/hive/warehouse/t_hive
OK
Time taken: 0.612 seconds


# Locate the newly loaded data in HDFS
~ hadoop fs -cat /user/hive/warehouse/t_hive/t_hive.txt


16      2       3
61      12      13
41      2       31
17      21      3
71      2       31
1       12      34
11      2       34
Load data from HDFS


Create table t_hive2
hive> CREATE TABLE t_hive2 (a int, b int, c int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';


# Load data from HDFS
hive> LOAD DATA INPATH '/user/hive/warehouse/t_hive/t_hive.txt' OVERWRITE INTO TABLE t_hive2;
Loading data to table default.t_hive2
Deleted hdfs://c1.wtmart.com:9000/user/hive/warehouse/t_hive2
OK
Time taken: 0.325 seconds


# View the data
hive> select * from t_hive2;
OK
16      2       3
61      12      13
41      2       31
17      21      3
71      2       31
1       12      34
11      2       34
Time taken: 0.287 seconds
Import data from another table




hive> INSERT OVERWRITE TABLE t_hive2 SELECT * FROM t_hive ;


Total MapReduce jobs = 2
Launching Job 1 out of 2
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_201307131407_0002, Tracking URL = http://c1.wtmart.com:50030/jobdetails.jsp?jobid=job_201307131407_0002
Kill Command = /home/hadoop/hive/toolkit/hadoop-1.0.3/libexec/../bin/hadoop job  -Dmapred.job.tracker=hdfs://c1.wtmart.com:9001 -kill job_201307131407_0002
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2013-07-16 10:32:41,979 Stage-1 map = 0%,  reduce = 0%
2013-07-16 10:32:48,034 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 1.03 sec
2013-07-16 10:32:49,050 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 1.03 sec
2013-07-16 10:32:50,068 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 1.03 sec
2013-07-16 10:32:51,082 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 1.03 sec
2013-07-16 10:32:52,093 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 1.03 sec
2013-07-16 10:32:53,102 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 1.03 sec
2013-07-16 10:32:54,112 Stage-1 map = 100%,  reduce = 100%, Cumulative CPU 1.03 sec
MapReduce Total cumulative CPU time: 1 seconds 30 msec
Ended Job = job_201307131407_0002
Ended Job = -314818888, job is filtered out (removed at runtime).
Moving data to: hdfs://c1.wtmart.com:9000/tmp/hive-cos/hive_2013-07-16_10-32-31_323_5732404975764014154/-ext-10000
Loading data to table default.t_hive2
Deleted hdfs://c1.wtmart.com:9000/user/hive/warehouse/t_hive2
Table default.t_hive2 stats: [num_partitions: 0, num_files: 1, num_rows: 0, total_size: 56, raw_data_size: 0]
7 Rows loaded to t_hive2
MapReduce Jobs Launched:
Job 0: Map: 1   Cumulative CPU: 1.03 sec   HDFS Read: 273 HDFS Write: 56 SUCCESS
Total MapReduce CPU Time Spent: 1 seconds 30 msec
OK
Time taken: 23.227 seconds


hive> select * from t_hive2;
OK
16      2       3
61      12      13
41      2       31
17      21      3
71      2       31
1       12      34
11      2       34
Time taken: 0.134 seconds
Create a table and import data from another table (CTAS)


# Drop the table
hive> DROP TABLE t_hive;


# Create the table and populate it from another table
hive> CREATE TABLE t_hive AS SELECT * FROM t_hive2 ;


Total MapReduce jobs = 2
Launching Job 1 out of 2
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_201307131407_0003, Tracking URL = http://c1.wtmart.com:50030/jobdetails.jsp?jobid=job_201307131407_0003
Kill Command = /home/hadoop/hive/toolkit/hadoop-1.0.3/libexec/../bin/hadoop job  -Dmapred.job.tracker=hdfs://c1.wtmart.com:9001 -kill job_201307131407_0003
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2013-07-16 10:36:48,612 Stage-1 map = 0%,  reduce = 0%
2013-07-16 10:36:54,648 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 1.13 sec
2013-07-16 10:36:55,657 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 1.13 sec
2013-07-16 10:36:56,666 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 1.13 sec
2013-07-16 10:36:57,673 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 1.13 sec
2013-07-16 10:36:58,683 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 1.13 sec
2013-07-16 10:36:59,691 Stage-1 map = 100%,  reduce = 100%, Cumulative CPU 1.13 sec
MapReduce Total cumulative CPU time: 1 seconds 130 msec
Ended Job = job_201307131407_0003
Ended Job = -670956236, job is filtered out (removed at runtime).
Moving data to: hdfs://c1.wtmart.com:9000/tmp/hive-cos/hive_2013-07-16_10-36-39_986_1343249562812540343/-ext-10001
Moving data to: hdfs://c1.wtmart.com:9000/user/hive/warehouse/t_hive
Table default.t_hive stats: [num_partitions: 0, num_files: 1, num_rows: 0, total_size: 56, raw_data_size: 0]
7 Rows loaded to hdfs://c1.wtmart.com:9000/tmp/hive-cos/hive_2013-07-16_10-36-39_986_1343249562812540343/-ext-10000
MapReduce Jobs Launched:
Job 0: Map: 1   Cumulative CPU: 1.13 sec   HDFS Read: 272 HDFS Write: 56 SUCCESS
Total MapReduce CPU Time Spent: 1 seconds 130 msec
OK
Time taken: 20.13 seconds


hive> select * from t_hive;
OK
16      2       3
61      12      13
41      2       31
17      21      3
71      2       31
1       12      34
11      2       34
Time taken: 0.109 seconds
Copy only the table structure, without the data


hive> CREATE TABLE t_hive3 LIKE t_hive;
hive> select * from t_hive3;
OK
Time taken: 0.077 seconds
Importing data from MySQL
This will be covered when we introduce Sqoop.


5. Data Export
Copy from HDFS to another HDFS location


~ hadoop fs -cp /user/hive/warehouse/t_hive /


~ hadoop fs -ls /t_hive
Found 1 items
-rw-r--r--   1 cos supergroup         56 2013-07-16 10:41 /t_hive/000000_0


~ hadoop fs -cat /t_hive/000000_0
1623
611213
41231
17213
71231
11234
11234

(The columns appear to run together because this file was produced by the earlier CTAS without a ROW FORMAT clause, so the fields are separated by Hive's default delimiter \001 (Ctrl-A), which is invisible in the terminal.)


Export to the local filesystem via Hive


hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/t_hive' SELECT * FROM t_hive;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_201307131407_0005, Tracking URL = http://c1.wtmart.com:50030/jobdetails.jsp?jobid=job_201307131407_0005
Kill Command = /home/hadoop/hive/toolkit/hadoop-1.0.3/libexec/../bin/hadoop job  -Dmapred.job.tracker=hdfs://c1.wtmart.com:9001 -kill job_201307131407_0005
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2013-07-16 10:46:24,774 Stage-1 map = 0%,  reduce = 0%
2013-07-16 10:46:30,823 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 0.87 sec
2013-07-16 10:46:31,833 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 0.87 sec
2013-07-16 10:46:32,844 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 0.87 sec
2013-07-16 10:46:33,856 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 0.87 sec
2013-07-16 10:46:34,865 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 0.87 sec
2013-07-16 10:46:35,873 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 0.87 sec
2013-07-16 10:46:36,884 Stage-1 map = 100%,  reduce = 100%, Cumulative CPU 0.87 sec
MapReduce Total cumulative CPU time: 870 msec
Ended Job = job_201307131407_0005
Copying data to local directory /tmp/t_hive
Copying data to local directory /tmp/t_hive
7 Rows loaded to /tmp/t_hive
MapReduce Jobs Launched:
Job 0: Map: 1   Cumulative CPU: 0.87 sec   HDFS Read: 271 HDFS Write: 56 SUCCESS
Total MapReduce CPU Time Spent: 870 msec
OK
Time taken: 23.369 seconds


# Check the file on the local filesystem (again \001-delimited, so the columns run together)
hive> ! cat /tmp/t_hive/000000_0;
1623
611213
41231
17213
71231
11234
11234
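
Besides exporting to the local filesystem, the same statement without the LOCAL keyword writes the result to a directory on HDFS; a minimal sketch, with an illustrative output path:

hive> INSERT OVERWRITE DIRECTORY '/tmp/t_hive_export' SELECT * FROM t_hive;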


6. Querying with HiveQL
Note: the map/reduce progress logs are omitted from the listings below.


Basic queries: column aliases, nested subqueries, WHERE, LIMIT


hive> FROM (
    >   SELECT b,c as c2 FROM t_hive
    > ) t
    > SELECT t.b, t.c2
    > WHERE b>2
    > LIMIT 2;
12      13
21      3
Join queries: JOIN


hive> SELECT t1.a,t1.b,t2.a,t2.b
    > FROM t_hive t1 JOIN t_hive2 t2 on t1.a=t2.a
    > WHERE t1.c>10;


1       12      1       12
11      2       11      2
41      2       41      2
61      12      61      12
71      2       71      2
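
Hive supports outer joins with the same syntax; for example, a LEFT OUTER JOIN keeps unmatched rows from the left table (a sketch over the same two tables):

hive> SELECT t1.a, t1.b, t2.a, t2.b
    > FROM t_hive t1 LEFT OUTER JOIN t_hive2 t2 ON t1.a = t2.a;
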
Aggregate query 1: count, avg


hive> SELECT count(*), avg(a) FROM t_hive;
7       31.142857142857142
Aggregate query 2: count, distinct




hive> SELECT count(DISTINCT b) FROM t_hive;
3
Aggregate query 3: GROUP BY, HAVING


# GROUP BY
hive> SELECT avg(a),b,sum(c) FROM t_hive GROUP BY b,c;
16.0    2       3
56.0    2       62
11.0    2       34
61.0    12      13
1.0     12      34
17.0    21      3


# HAVING
hive> SELECT avg(a),b,sum(c) FROM t_hive GROUP BY b,c HAVING sum(c)>30;
56.0    2       62
11.0    2       34
1.0     12      34


7. Hive Views
Hive views work the same way as views in a relational database; we again use t_hive as the example.


hive> CREATE VIEW v_hive AS SELECT a,b FROM t_hive where c>30;
hive> select * from v_hive;
41      2
71      2
1       12
11      2


Drop a view


hive> DROP VIEW IF EXISTS v_hive;
OK
Time taken: 0.495 seconds
8. Hive Partitioned Tables
Partitioned tables are a basic database concept, and with small data volumes they are often unnecessary. Hive, however, is OLAP data-warehouse software that deals with very large data volumes, so partitioning is especially important in this setting.


Below we define a new table structure, t_hft:


Create the data


~ vi /home/hadoop/hive/demo/t_hft_20130627.csv
000001,092023,9.76
000002,091947,8.99
000004,092002,9.79
000005,091514,2.2
000001,092008,9.70
000001,092059,9.45


~ vi /home/hadoop/hive/demo/t_hft_20130628.csv
000001,092023,9.76
000002,091947,8.99
000004,092002,9.79
000005,091514,2.2
000001,092008,9.70
000001,092059,9.45
Create the table




DROP TABLE IF EXISTS t_hft;
CREATE TABLE t_hft(
SecurityID STRING,
tradeTime STRING,
PreClosePx DOUBLE
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
Create the partitioned table
Based on the business requirements, the data is partitioned by trading day; the security ID stays an ordinary column that queries filter on.


DROP TABLE IF EXISTS t_hft;
CREATE TABLE t_hft(
SecurityID STRING,
tradeTime STRING,
PreClosePx DOUBLE
) PARTITIONED BY (tradeDate INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
Load the data


#20130627
hive> LOAD DATA LOCAL INPATH '/home/hadoop/hive/demo/t_hft_20130627.csv' OVERWRITE INTO TABLE t_hft PARTITION (tradeDate=20130627);
Copying data from file:/home/hadoop/hive/demo/t_hft_20130627.csv
Copying file: file:/home/hadoop/hive/demo/t_hft_20130627.csv
Loading data to table default.t_hft partition (tradedate=20130627)


#20130628
hive> LOAD DATA LOCAL INPATH '/home/hadoop/hive/demo/t_hft_20130628.csv' OVERWRITE INTO TABLE t_hft PARTITION (tradeDate=20130628);
Copying data from file:/home/hadoop/hive/demo/t_hft_20130628.csv
Copying file: file:/home/hadoop/hive/demo/t_hft_20130628.csv
Loading data to table default.t_hft partition (tradedate=20130628)


View the partitions


hive> SHOW PARTITIONS t_hft;
tradedate=20130627
tradedate=20130628
Time taken: 0.082 seconds
Query the data


hive> select * from t_hft where securityid='000001';
000001  092023  9.76    20130627
000001  092008  9.7     20130627
000001  092059  9.45    20130627
000001  092023  9.76    20130628
000001  092008  9.7     20130628
000001  092059  9.45    20130628


hive> select * from t_hft where tradedate=20130627 and PreClosePx<9;
000002  091947  8.99    20130627
000005  091514  2.2     20130627
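
Because tradedate is a partition column, filters like the one above let Hive prune entire partitions instead of scanning the whole table. Partitions can also be managed directly; an illustrative sketch (the new date is hypothetical):

hive> ALTER TABLE t_hft ADD PARTITION (tradedate=20130629);
hive> SHOW PARTITIONS t_hft;
hive> ALTER TABLE t_hft DROP PARTITION (tradedate=20130629);
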
This concludes the basic usage of Hive.

