HBase Learning 02: Batch-putting Data into HBase Programmatically
1. Create the table in HBase
create 'test_lcc_mycase','case_lizu'
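If you prefer to create the table from Java instead of the shell, a minimal sketch using the HBase 1.x Admin API might look like the following. It assumes the same ZooKeeper quorum as the upload code later in this post; the class name is mine, not from the original.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class CreateMycaseTable {
    public static void main(String[] args) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("hbase.zookeeper.quorum", "192.168.10.82");
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            HTableDescriptor desc = new HTableDescriptor(TableName.valueOf("test_lcc_mycase"));
            // one column family, matching the shell command above
            desc.addFamily(new HColumnDescriptor("case_lizu"));
            if (!admin.tableExists(desc.getTableName())) {
                admin.createTable(desc);
            }
        }
    }
}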
2. Write the code that generates the test data
package sparksql.test.domain;

public class Mycase {
    private String c_code;
    private String c_rcode;
    private String c_region;
    private String c_cate;
    private String c_start;
    private String c_end;
    private long c_start_m;
    private long c_end_m;
    private String c_name;
    private String c_mark;

    @Override
    public String toString() {
        return c_code + "," + c_rcode + "," + c_region + "," + c_cate
                + "," + c_start + "," + c_end + "," + c_start_m + "," + c_end_m
                + "," + c_name + "," + c_mark + "\r\n";
    }

    public String getC_code() { return c_code; }
    public void setC_code(String c_code) { this.c_code = c_code; }
    public String getC_rcode() { return c_rcode; }
    public void setC_rcode(String c_rcode) { this.c_rcode = c_rcode; }
    public String getC_region() { return c_region; }
    public void setC_region(String c_region) { this.c_region = c_region; }
    public String getC_cate() { return c_cate; }
    public void setC_cate(String c_cate) { this.c_cate = c_cate; }
    public String getC_start() { return c_start; }
    public void setC_start(String c_start) { this.c_start = c_start; }
    public String getC_end() { return c_end; }
    public void setC_end(String c_end) { this.c_end = c_end; }
    public long getC_start_m() { return c_start_m; }
    public void setC_start_m(long c_start_m) { this.c_start_m = c_start_m; }
    public long getC_end_m() { return c_end_m; }
    public void setC_end_m(long c_end_m) { this.c_end_m = c_end_m; }
    public String getC_name() { return c_name; }
    public void setC_name(String c_name) { this.c_name = c_name; }
    public String getC_mark() { return c_mark; }
    public void setC_mark(String c_mark) { this.c_mark = c_mark; }
}
package sparksql.test.files;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Random;

import sparksql.test.domain.Mycase;

public class MyCaseFile {

    public static void main(String[] args) throws IOException {
        // file (disk) --input stream--> [program] --output stream--> file (disk)
        for (int k = 1; k <= 100; k++) {
            String fileName = "mycase" + k + ".txt";
            File file = new File("E:/temp", fileName);
            file.createNewFile(); // create the output file
            FileOutputStream fos = new FileOutputStream(file);
            OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
            Mycase mycase = new Mycase();
            Random r = new Random();
            String start;
            String end;
            SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
            long start_m = Long.parseLong("1150992000000");
            // 1,000,000 cases per file
            for (int i = 1000000 * k - 1000000; i < 1000000 * k; i++) {
                mycase.setC_code("A" + i);
                // nextInt(9) yields an integer in [0, 9): nine district codes
                int qu = r.nextInt(9);
                mycase.setC_rcode(qu + "");
                switch (qu) {
                case 1: mycase.setC_region("杭州市上城区"); break;
                case 2: mycase.setC_region("杭州市下城区"); break;
                case 3: mycase.setC_region("杭州市拱墅区"); break;
                case 4: mycase.setC_region("杭州市江干区"); break;
                case 5: mycase.setC_region("杭州市西湖区"); break;
                case 6: mycase.setC_region("杭州市滨江区"); break;
                case 7: mycase.setC_region("杭州市萧山区"); break;
                case 8: mycase.setC_region("杭州市余杭区"); break;
                case 0: mycase.setC_region("杭州市其他区"); break;
                default:
                    mycase.setC_region("杭州市其他区");
                    System.out.println(qu + " has no matching district");
                }
                // four case categories
                int c_cate = r.nextInt(4);
                switch (c_cate) {
                case 0: mycase.setC_cate("刑事案件"); break;
                case 1: mycase.setC_cate("盗窃案件"); break;
                case 2: mycase.setC_cate("强奸案件"); break;
                case 3: mycase.setC_cate("杀人案件"); break;
                default:
                    System.out.println(c_cate + " has no matching case category");
                }
                int day = r.nextInt(5);   // advance the start time by 0-4 days
                int hour = r.nextInt(24); // case duration: 0-23 hours
                start_m = start_m + 86400000L * day;
                start = simpleDateFormat.format(new Date(start_m));
                long end_m = start_m + 3600000L * hour;
                end = simpleDateFormat.format(new Date(end_m));
                mycase.setC_start(start);
                mycase.setC_end(end);
                mycase.setC_start_m(start_m);
                mycase.setC_end_m(end_m);
                mycase.setC_name("案件名称" + i);
                mycase.setC_mark("暂无");
                // write the record to the file (output stream)
                osw.write(mycase.toString());
                osw.flush();
            }
            osw.close();
        }
    }
}
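The generator works in epoch milliseconds (86400000 ms per day, 3600000 ms per hour), starting from the seed 1150992000000. A quick sanity check of that seed, assuming the machine runs in China time (GMT+8) as the sample data suggests:

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.TimeZone;

public class EpochSanityCheck {
    public static void main(String[] args) {
        SimpleDateFormat f = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
        // the sample data below was formatted in China time
        f.setTimeZone(TimeZone.getTimeZone("GMT+8"));
        // 1150992000000 is the generator's start_m seed
        System.out.println(f.format(new Date(1150992000000L))); // 2006/06/23 00:00:00
    }
}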
3. Generate the test data. The output files contain lines like these:
A0,6,杭州市滨江区,盗窃案件,2006/06/23 00:00:00,2006/06/23 10:00:00,1150992000000,1151028000000,案件名称0,暂无
A1,3,杭州市拱墅区,盗窃案件,2006/06/24 00:00:00,2006/06/24 07:00:00,1151078400000,1151103600000,案件名称1,暂无
A2,2,杭州市下城区,盗窃案件,2006/06/27 00:00:00,2006/06/27 09:00:00,1151337600000,1151370000000,案件名称2,暂无
A3,3,杭州市拱墅区,盗窃案件,2006/07/01 00:00:00,2006/07/01 07:00:00,1151683200000,1151708400000,案件名称3,暂无
A4,0,杭州市其他区,强奸案件,2006/07/05 00:00:00,2006/07/05 01:00:00,1152028800000,1152032400000,案件名称4,暂无
A5,4,杭州市江干区,杀人案件,2006/07/08 00:00:00,2006/07/08 23:00:00,1152288000000,1152370800000,案件名称5,暂无
A6,7,杭州市萧山区,盗窃案件,2006/07/11 00:00:00,2006/07/11 23:00:00,1152547200000,1152630000000,案件名称6,暂无
A7,6,杭州市滨江区,刑事案件,2006/07/13 00:00:00,2006/07/13 22:00:00,1152720000000,1152799200000,案件名称7,暂无
A8,2,杭州市下城区,杀人案件,2006/07/13 00:00:00,2006/07/13 22:00:00,1152720000000,1152799200000,案件名称8,暂无
A9,1,杭州市上城区,杀人案件,2006/07/15 00:00:00,2006/07/15 21:00:00,1152892800000,1152968400000,案件名称9,暂无
4. Remember: every step must use UTF-8 end to end, otherwise you will get garbled characters.
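In practice that means passing the charset explicitly wherever a reader or writer is constructed, rather than relying on the platform default. A minimal sketch of the pattern used throughout this post (the scratch file path is hypothetical):

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;

public class Utf8RoundTrip {
    public static void main(String[] args) throws IOException {
        File file = new File("E:/temp/utf8-check.txt"); // hypothetical scratch file
        // write with an explicit UTF-8 encoder instead of the platform default
        try (Writer w = new OutputStreamWriter(new FileOutputStream(file), StandardCharsets.UTF_8)) {
            w.write("杭州市西湖区\n");
        }
        // read it back with a matching UTF-8 decoder
        try (BufferedReader r = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            System.out.println(r.readLine()); // prints 杭州市西湖区, not mojibake
        }
    }
}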
5. Write the upload code
package sparksql.hbase.fileToHbase;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class MycaseUpToHbase {

    public static void main(String[] args) throws IOException {
        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.property.clientPort", "2181");
        configuration.set("hbase.zookeeper.quorum", "192.168.10.82");
        configuration.set("hbase.client.write.buffer", "2097152");
        // the default Connection implementation is
        // org.apache.hadoop.hbase.client.ConnectionManager.HConnectionImplementation
        Connection connection = ConnectionFactory.createConnection(configuration);
        // the default Table implementation is org.apache.hadoop.hbase.client.HTable
        Table table = connection.getTable(TableName.valueOf("test_lcc_mycase"));
        // 3177 is not made up: it is 2 * hbase.client.write.buffer / put.heapSize()
        int bestBatchPutSize = 3177;
        try {
            // single thread, single Table instance; collect Puts and flush in batches
            List<Put> putLists = new ArrayList<Put>();
            for (int k = 1; k <= 100; k++) {
                String fileName = "mycase" + k + ".txt";
                String filePath = "E:/temp/data1000000-10000000-10000000/" + fileName;
                File file = new File(filePath);
                System.out.println("Reading the file line by line:");
                BufferedReader reader = new BufferedReader(new InputStreamReader(
                        new FileInputStream(file), "UTF-8"));
                String tempString = null;
                int line = 1;
                // read one line at a time until readLine() returns null (end of file)
                while ((tempString = reader.readLine()) != null) {
                    System.out.println("line " + line + ": " + tempString);
                    String[] array = tempString.split(",");
                    String c_code = array[0];
                    String c_rcode = array[1];
                    String c_region = array[2];
                    String c_cate = array[3];
                    String c_start = array[4];
                    String c_end = array[5];
                    String c_start_m = array[6];
                    String c_end_m = array[7];
                    String c_name = array[8];
                    String c_mark = array[9];
                    Put put = new Put(Bytes.toBytes(c_code));
                    // the explicit getBytes("UTF-8") matters here; without it the
                    // values may be written garbled
                    put.addImmutable("case_lizu".getBytes(), "c_code".getBytes(), c_code.getBytes("UTF-8"));
                    put.addImmutable("case_lizu".getBytes(), "c_rcode".getBytes(), c_rcode.getBytes("UTF-8"));
                    put.addImmutable("case_lizu".getBytes(), "c_region".getBytes(), c_region.getBytes("UTF-8"));
                    put.addImmutable("case_lizu".getBytes(), "c_cate".getBytes(), c_cate.getBytes("UTF-8"));
                    put.addImmutable("case_lizu".getBytes(), "c_start".getBytes(), c_start.getBytes("UTF-8"));
                    put.addImmutable("case_lizu".getBytes(), "c_end".getBytes(), c_end.getBytes("UTF-8"));
                    put.addImmutable("case_lizu".getBytes(), "c_start_m".getBytes(), c_start_m.getBytes("UTF-8"));
                    put.addImmutable("case_lizu".getBytes(), "c_end_m".getBytes(), c_end_m.getBytes("UTF-8"));
                    put.addImmutable("case_lizu".getBytes(), "c_name".getBytes(), c_name.getBytes("UTF-8"));
                    put.addImmutable("case_lizu".getBytes(), "c_mark".getBytes(), c_mark.getBytes("UTF-8"));
                    put.setDurability(Durability.SKIP_WAL); // skip the WAL for faster bulk writes
                    putLists.add(put);
                    if (putLists.size() == bestBatchPutSize) {
                        // the batch has reached the optimal size: flush it now
                        table.put(putLists);
                        putLists.clear();
                    }
                    line++;
                }
                reader.close();
                // flush whatever did not fill a whole batch for this file, and
                // clear the list so the remainder is not re-sent on the next file
                table.put(putLists);
                putLists.clear();
            }
        } finally {
            table.close();
            connection.close();
        }
    }
}
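The code above hard-codes 3177 as the result of 2 * hbase.client.write.buffer / put.heapSize(). If the row shape changes, the batch size can be derived at runtime from a representative Put instead of being recomputed by hand; a sketch (the helper class and method names are mine, not from the original):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Put;

public class BatchSizeUtil {
    // derives the batch size as 2 * hbase.client.write.buffer / put.heapSize(),
    // given a sample Put built the same way as the ones in the upload loop
    public static int bestBatchPutSize(Configuration conf, Put samplePut) {
        long writeBuffer = conf.getLong("hbase.client.write.buffer", 2097152L);
        return (int) (2 * writeBuffer / samplePut.heapSize());
    }
}

With the 2 MB write buffer configured above and the ten-column rows of this table, this is the computation that yielded the 3177 used in the upload code.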
6. Run the upload code, then write the Scala code that reads the data back
package sparlsql.hbase

import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkContext, SparkConf}

object SparkSQLOnHbase {

  def main(args: Array[String]): Unit = {
    val starttime = System.nanoTime
    // run in local mode for easy testing
    val sparkConf = new SparkConf().setMaster("local").setAppName("HBaseTest")
    // build the HBase configuration
    val hBaseConf = HBaseConfiguration.create()
    hBaseConf.set("hbase.zookeeper.property.clientPort", "2181")
    hBaseConf.set("hbase.zookeeper.quorum", "192.168.10.82")
    hBaseConf.set(TableInputFormat.INPUT_TABLE, "test_lcc_mycase")
    // create the Spark context
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    // read the rows from HBase as an RDD
    val hbaseRDD = sc.newAPIHadoopRDD(hBaseConf, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable], classOf[Result])
    // map each row onto a schema, i.e. turn the RDD into a DataFrame
    val mycase = hbaseRDD.map(r => (
      Bytes.toString(r._2.getValue(Bytes.toBytes("case_lizu"), Bytes.toBytes("c_code"))),
      Bytes.toString(r._2.getValue(Bytes.toBytes("case_lizu"), Bytes.toBytes("c_rcode"))),
      Bytes.toString(r._2.getValue(Bytes.toBytes("case_lizu"), Bytes.toBytes("c_region"))),
      Bytes.toString(r._2.getValue(Bytes.toBytes("case_lizu"), Bytes.toBytes("c_cate"))),
      Bytes.toString(r._2.getValue(Bytes.toBytes("case_lizu"), Bytes.toBytes("c_start"))),
      Bytes.toString(r._2.getValue(Bytes.toBytes("case_lizu"), Bytes.toBytes("c_end"))),
      Bytes.toString(r._2.getValue(Bytes.toBytes("case_lizu"), Bytes.toBytes("c_start_m"))),
      Bytes.toString(r._2.getValue(Bytes.toBytes("case_lizu"), Bytes.toBytes("c_end_m"))),
      Bytes.toString(r._2.getValue(Bytes.toBytes("case_lizu"), Bytes.toBytes("c_name"))),
      Bytes.toString(r._2.getValue(Bytes.toBytes("case_lizu"), Bytes.toBytes("c_mark")))
    )).toDF("c_code", "c_rcode", "c_region", "c_cate", "c_start", "c_end",
      "c_start_m", "c_end_m", "c_name", "c_mark")
    mycase.registerTempTable("mycase")
    // test query
    val df5 = sqlContext.sql("select * from mycase")
    df5.show()
    println(df5.count())
    // df5.collect().foreach(print(_))
    val endtime = System.nanoTime
    val delta = endtime - starttime
    println(delta / 1000000d) // elapsed time in milliseconds
  }
}
7. The printed result
+------+-------+--------+------+-------------------+-------------------+-------------+-------------+--------+------+
|c_code|c_rcode|c_region|c_cate|            c_start|              c_end|    c_start_m|      c_end_m|  c_name|c_mark|
+------+-------+--------+------+-------------------+-------------------+-------------+-------------+--------+------+
|    A0|      5|  杭州市西湖区|  强奸案件|2006/06/24 00:00:00|2006/06/24 13:00:00|1151078400000|1151125200000|   案件名称0|    暂无|
|    A1|      1|  杭州市上城区|  强奸案件|2006/06/25 00:00:00|2006/06/25 22:00:00|1151164800000|1151244000000|   案件名称1|    暂无|
|   A10|      6|  杭州市滨江区|  刑事案件|2006/07/06 00:00:00|2006/07/06 20:00:00|1152115200000|1152187200000|  案件名称10|    暂无|
|  A100|      1|  杭州市上城区|  杀人案件|2006/12/31 00:00:00|2006/12/31 10:00:00|1167494400000|1167530400000| 案件名称100|    暂无|
| A1000|      6|  杭州市滨江区|  强奸案件|2011/12/25 00:00:00|2011/12/25 03:00:00|1324742400000|1324753200000|案件名称1000|    暂无|
| A1001|      4|  杭州市江干区|  强奸案件|2011/12/29 00:00:00|2011/12/29 03:00:00|1325088000000|1325098800000|案件名称1001|    暂无|
| A1002|      8|  杭州市余杭区|  盗窃案件|2011/12/30 00:00:00|2011/12/30 09:00:00|1325174400000|1325206800000|案件名称1002|    暂无|
| A1003|      1|  杭州市上城区|  刑事案件|2011/12/31 00:00:00|2011/12/31 08:00:00|1325260800000|1325289600000|案件名称1003|    暂无|
| A1004|      4|  杭州市江干区|  强奸案件|2011/12/31 00:00:00|2011/12/31 22:00:00|1325260800000|1325340000000|案件名称1004|    暂无|
| A1005|      7|  杭州市萧山区|  刑事案件|2012/01/04 00:00:00|2012/01/04 08:00:00|1325606400000|1325635200000|案件名称1005|    暂无|
| A1006|      2|  杭州市下城区|  杀人案件|2012/01/06 00:00:00|2012/01/06 20:00:00|1325779200000|1325851200000|案件名称1006|    暂无|
| A1007|      4|  杭州市江干区|  刑事案件|2012/01/07 00:00:00|2012/01/07 02:00:00|1325865600000|1325872800000|案件名称1007|    暂无|
| A1008|      1|  杭州市上城区|  强奸案件|2012/01/09 00:00:00|2012/01/09 23:00:00|1326038400000|1326121200000|案件名称1008|    暂无|
| A1009|      3|  杭州市拱墅区|  强奸案件|2012/01/09 00:00:00|2012/01/09 15:00:00|1326038400000|1326092400000|案件名称1009|    暂无|
|  A101|      0|  杭州市其他区|  刑事案件|2007/01/02 00:00:00|2007/01/02 05:00:00|1167667200000|1167685200000| 案件名称101|    暂无|
| A1010|      1|  杭州市上城区|  强奸案件|2012/01/11 00:00:00|2012/01/11 14:00:00|1326211200000|1326261600000|案件名称1010|    暂无|
| A1011|      7|  杭州市萧山区|  盗窃案件|2012/01/15 00:00:00|2012/01/15 12:00:00|1326556800000|1326600000000|案件名称1011|    暂无|
| A1012|      3|  杭州市拱墅区|  刑事案件|2012/01/15 00:00:00|2012/01/15 11:00:00|1326556800000|1326596400000|案件名称1012|    暂无|
| A1013|      0|  杭州市其他区|  强奸案件|2012/01/17 00:00:00|2012/01/17 06:00:00|1326729600000|1326751200000|案件名称1013|    暂无|
| A1014|      6|  杭州市滨江区|  刑事案件|2012/01/18 00:00:00|2012/01/18 15:00:00|1326816000000|1326870000000|案件名称1014|    暂无|
+------+-------+--------+------+-------------------+-------------------+-------------+-------------+--------+------+