MR: parsing an XML config to batch-import data (IPv4, IPv6) into HBase
First, create the corresponding table in HBase:
hbase(main):003:0> create 'messages','cf'
The first version (messages3.java) recognizes IPv4 addresses with a regex taken from the config file:
[hadoop@h71 hui]$ vi messages3.java
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.dom4j.Attribute;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

public class messages3 {
    public static void main(String[] args) throws Exception {
        final Configuration configuration = new Configuration();
        configuration.set("hbase.zookeeper.quorum", "192.168.8.71");
        configuration.set(TableOutputFormat.OUTPUT_TABLE, "messages");
        configuration.set("dfs.socket.timeout", "180000");
        final Job job = new Job(configuration, "HBaseBatchImport");
        job.setJarByClass(messages3.class);
        job.setMapperClass(BatchImportMapper.class);
        job.setReducerClass(BatchImportReducer.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TableOutputFormat.class);
        FileInputFormat.setInputPaths(job, "hdfs://192.168.8.71:9000/messages");
        job.waitForCompletion(true);
    }

    static class BatchImportMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        Text v2 = new Text();

        protected void map(LongWritable key, Text value, Context context)
                throws java.io.IOException, InterruptedException {
            // Read the field separator from the local config file
            // (dao.xml must exist at this path on every node).
            SAXReader reader = new SAXReader();
            Document document = null;
            try {
                document = reader.read("/home/hadoop/hui/dao.xml");
            } catch (DocumentException e1) {
                e1.printStackTrace();
            }
            Element root = document.getRootElement();
            List e2 = document.selectNodes("/peizhi/hbase/fengefu/@fuhao");
            String h2 = ((Attribute) e2.get(0)).getText();
            final String[] splited = value.toString().split(h2);
            if (splited[3].length() < 5) {
                // Skip lines whose fourth field is shorter than five characters
                // (a hostname such as h107 rather than an IP address).
                System.out.println("gai hang shu jv wu yong"); // "this line is useless"
            } else {
                try {
                    // Row key: syslog timestamp "Jan 23 19:59:00" -> "20170123195900"
                    // (the year is hard-coded because syslog lines carry none).
                    final String date0 = splited[0] + " " + splited[1] + " " + splited[2];
                    SimpleDateFormat dateformat1 = new SimpleDateFormat("MMM dd HH:mm:ss", Locale.ENGLISH);
                    Date date = dateformat1.parse(date0);
                    SimpleDateFormat datef = new SimpleDateFormat("MMddHHmmss");
                    String date1 = "2017" + datef.format(date);
                    String rowKey = date1;
                    v2.set(rowKey + " " + value.toString());
                    context.write(key, v2);
                } catch (NumberFormatException e) {
                    final Counter counter = context.getCounter("BatchImport", "ErrorFormat");
                    counter.increment(1L);
                    System.out.println("chu cuo le" + splited[0] + " " + e.getMessage()); // "an error occurred"
                } catch (ParseException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    static class BatchImportReducer extends TableReducer<LongWritable, Text, NullWritable> {
        protected void reduce(LongWritable key, java.lang.Iterable<Text> values, Context context)
                throws java.io.IOException, InterruptedException {
            SAXReader reader = new SAXReader();
            Document document = null;
            try {
                document = reader.read("/home/hadoop/hui/dao.xml");
            } catch (DocumentException e1) {
                e1.printStackTrace();
            }
            Element root = document.getRootElement();
            // <zhengze> holds the IPv4 regex, <fengefu fuhao=" "> the field separator.
            List e = document.selectNodes("/peizhi/hbase/zhengze");
            List e2 = document.selectNodes("/peizhi/hbase/fengefu/@fuhao");
            String h1 = ((Element) e.get(e.size() - 1)).getText();
            String h2 = ((Attribute) e2.get(0)).getText();
            for (Text text : values) {
                final String[] splited = text.toString().split(h2);
                // splited[0] is the row key the mapper prepended, e.g. 20170123195900.
                final Put put = new Put(Bytes.toBytes(splited[0]));
                for (Iterator i = root.element("hbase").elementIterator(); i.hasNext();) {
                    Element element = (Element) i.next();
                    String name = null;
                    String neirong = null;
                    int a = 0;
                    if (element.getQualifiedName().equals("ziduan")) {
                        // Each <ziduan> maps a column name to a field index;
                        // +1 compensates for the prepended row key.
                        name = element.attributeValue("name");
                        neirong = element.getText();
                        a = Integer.parseInt(neirong);
                        String pattern1 = h1;
                        Pattern p1 = Pattern.compile(pattern1);
                        Matcher matcher1 = p1.matcher(splited[4]);
                        if (matcher1.matches()) {
                            put.add(Bytes.toBytes("cf"), name.getBytes(), Bytes.toBytes(splited[a + 1]));
                            context.write(NullWritable.get(), put);
                        } else {
                            // Not a dotted-decimal IPv4 address: drop the whole line.
                            break;
                        }
                    }
                }
            }
        }
    }
}

[hadoop@h71 hui]$ hadoop fs -cat hdfs://192.168.8.71:9000/messages
Jan 23 19:59:00 192.168.101.254 s_sys@hui trafficlogger: empty map for 1:4097 in classnames
Feb 20 06:25:04 h107 rsyslogd: [origin software="rsyslogd" swVersion="8.4.2" x-pid="22204" x-info="http://www.rsyslog.com"] rsyslogd was HUPed
Jan 24 19:59:01 192.168.101.254 s_sys@hui trafficlogger: empty map for 1:4097 in classnames
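The first three fields of each record ("Jan 23 19:59:00") become the HBase row key. Below is a minimal standalone sketch of the mapper's conversion (the class name RowKeyDemo is just for illustration; the hard-coded "2017" is copied from the job above, since syslog timestamps carry no year):

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

public class RowKeyDemo {
    public static void main(String[] args) throws Exception {
        String date0 = "Jan 23 19:59:00";                 // splited[0..2] joined with spaces
        SimpleDateFormat in = new SimpleDateFormat("MMM dd HH:mm:ss", Locale.ENGLISH);
        Date date = in.parse(date0);
        SimpleDateFormat out = new SimpleDateFormat("MMddHHmmss");
        String rowKey = "2017" + out.format(date);        // year prefix is assumed, as in the job
        System.out.println(rowKey);                       // prints 20170123195900
    }
}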
[hadoop@h71 hui]$ vi dao.xml
<?xml version="1.0" encoding="UTF-8"?>
<peizhi>
    <hbase>
        <jianbao name="messages"></jianbao>
        <liezu name="cf"></liezu>
        <ziduan name="ipv4">3</ziduan>
        <ziduan name="ipv6">3</ziduan>
        <ziduan name="host">4</ziduan>
        <ziduan name="leixing">5</ziduan>
        <guolv ip="h107"></guolv>
        <fengefu fuhao=" "></fengefu>
        <zhengze>^([\d.]+)</zhengze>
    </hbase>
</peizhi>
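Because the job re-reads dao.xml in every map() and reduce() call, it is worth verifying once, standalone, that the file parses and the XPath expressions return what the job expects. A minimal sketch against the same dom4j 1.6.1 API (DaoXmlCheck is a hypothetical name; jaxen-1.1-beta-7.jar must be on the classpath for selectNodes to work):

import java.util.List;
import org.dom4j.Attribute;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

public class DaoXmlCheck {
    public static void main(String[] args) throws Exception {
        Document document = new SAXReader().read("/home/hadoop/hui/dao.xml");
        // Field separator: the fuhao attribute of <fengefu> (a single space here).
        List seps = document.selectNodes("/peizhi/hbase/fengefu/@fuhao");
        System.out.println("separator=[" + ((Attribute) seps.get(0)).getText() + "]");
        // The IPv4 regex from <zhengze>.
        List regexes = document.selectNodes("/peizhi/hbase/zhengze");
        System.out.println("regex=" + ((Element) regexes.get(0)).getText());
        // Every <ziduan>: column name -> field index.
        for (Object o : document.selectNodes("/peizhi/hbase/ziduan")) {
            Element z = (Element) o;
            System.out.println(z.attributeValue("name") + " -> " + z.getText());
        }
    }
}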
Pitfalls encountered:
Both dom4j-1.6.1.jar and jaxen-1.1-beta-7.jar must be copied into hbase-1.0.0-cdh5.5.2/lib/ on every node. At first I copied only dom4j-1.6.1.jar, and only onto the master node, which produced this error:

17/03/17 19:12:12 INFO mapreduce.Job: Task Id : attempt_1489747351579_0006_r_000000_0, Status : FAILED
Error: java.lang.ClassNotFoundException: org.dom4j.DocumentException
        at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
        at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
        at java.security.AccessController.doPrivileged(Native Method)
        at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
        at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
        at java.lang.Class.forName0(Native Method)
        at java.lang.Class.forName(Class.java:270)
        at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2138)
        at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2103)
        at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2197)
        at org.apache.hadoop.mapreduce.task.JobContextImpl.getReducerClass(JobContextImpl.java:220)
        at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:611)
        at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:389)
        at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:163)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:415)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1671)
        at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)

The required XML file must likewise be copied to the corresponding directory on every node. At first I put it only on the master node, which produced this error:
Error: java.lang.NullPointerException
        at messages3$BatchImportMapper.map(messages3.java:70)
        at messages3$BatchImportMapper.map(messages3.java:56)
        at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145)
        at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
        at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
        at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:163)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:415)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1671)
        at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
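That NullPointerException is the DocumentException in disguise: reader.read() fails because dao.xml is missing on that node, printStackTrace() swallows the exception, and the code then dereferences the null document. A sketch of one alternative (not what the code above does; ConfigLoader/loadDao are hypothetical names) is to fail fast with a message naming the real cause:

import java.io.IOException;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;

// Drop-in replacement for the try/catch in map() and reduce(): abort the task
// with a clear message instead of letting a NullPointerException hide the cause.
public class ConfigLoader {
    static Document loadDao(String path) throws IOException {
        try {
            return new SAXReader().read(path);
        } catch (DocumentException e) {
            throw new IOException("cannot read " + path + " on this node", e);
        }
    }
}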
The following version instead uses Java's built-in classes to handle IPv6 and IPv4:
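Before the full job, the core idea in isolation: InetAddress.getByName() returns an Inet4Address for a dotted-decimal literal and an Inet6Address for an IPv6 literal, so an instanceof check classifies each field. A minimal sketch (IpKind is a hypothetical name; a bare hostname such as h107 only resolves if DNS or /etc/hosts knows it, otherwise getByName throws UnknownHostException):

import java.net.Inet4Address;
import java.net.Inet6Address;
import java.net.InetAddress;
import java.net.UnknownHostException;

public class IpKind {
    public static void main(String[] args) {
        for (String s : new String[] { "192.168.101.254", "::" }) {
            try {
                InetAddress addr = InetAddress.getByName(s);
                if (addr instanceof Inet4Address) {
                    System.out.println(s + " -> ipv4 " + addr.getHostAddress());
                } else if (addr instanceof Inet6Address) {
                    // "::" normalizes to 0:0:0:0:0:0:0:0, as seen in the scan output below.
                    System.out.println(s + " -> ipv6 " + addr.getHostAddress());
                }
            } catch (UnknownHostException e) {
                System.out.println(s + " -> not an IP literal or resolvable host");
            }
        }
    }
}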
[hadoop@h71 q1]$ vi messages33.java
import java.net.Inet4Address;
import java.net.Inet6Address;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.dom4j.Attribute;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

public class messages33 {
    public static void main(String[] args) throws Exception {
        final Configuration configuration = new Configuration();
        configuration.set("hbase.zookeeper.quorum", "192.168.8.71");
        configuration.set(TableOutputFormat.OUTPUT_TABLE, "messages");
        configuration.set("dfs.socket.timeout", "180000");
        final Job job = new Job(configuration, "HBaseBatchImport");
        job.setJarByClass(messages33.class);
        job.setMapperClass(BatchImportMapper.class);
        job.setReducerClass(BatchImportReducer.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TableOutputFormat.class);
        FileInputFormat.setInputPaths(job, "hdfs://192.168.8.71:9000/messages");
        job.waitForCompletion(true);
    }

    static class BatchImportMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        Text v2 = new Text();

        protected void map(LongWritable key, Text value, Context context)
                throws java.io.IOException, InterruptedException {
            SAXReader reader = new SAXReader();
            Document document = null;
            try {
                document = reader.read("/home/hadoop/hui/dao.xml");
            } catch (DocumentException e1) {
                e1.printStackTrace();
            }
            Element root = document.getRootElement();
            List e2 = document.selectNodes("/peizhi/hbase/fengefu/@fuhao");
            String h2 = ((Attribute) e2.get(0)).getText();
            final String[] splited = value.toString().split(h2);
            try {
                // Same row-key construction as messages3; no length filter here,
                // since the reducer now classifies the field itself.
                final String date0 = splited[0] + " " + splited[1] + " " + splited[2];
                SimpleDateFormat dateformat1 = new SimpleDateFormat("MMM dd HH:mm:ss", Locale.ENGLISH);
                Date date = dateformat1.parse(date0);
                SimpleDateFormat datef = new SimpleDateFormat("MMddHHmmss");
                String date1 = "2017" + datef.format(date);
                String rowKey = date1;
                v2.set(rowKey + " " + value.toString());
                context.write(key, v2);
            } catch (NumberFormatException e) {
                final Counter counter = context.getCounter("BatchImport", "ErrorFormat");
                counter.increment(1L);
                System.out.println("chu cuo le" + splited[0] + " " + e.getMessage());
            } catch (ParseException e) {
                e.printStackTrace();
            }
        }
    }

    static class BatchImportReducer extends TableReducer<LongWritable, Text, NullWritable> {
        protected void reduce(LongWritable key, java.lang.Iterable<Text> values, Context context)
                throws java.io.IOException, InterruptedException {
            SAXReader reader = new SAXReader();
            Document document = null;
            try {
                document = reader.read("/home/hadoop/hui/dao.xml");
            } catch (DocumentException e1) {
                e1.printStackTrace();
            }
            Element root = document.getRootElement();
            List e2 = document.selectNodes("/peizhi/hbase/fengefu/@fuhao");
            String h2 = ((Attribute) e2.get(0)).getText();
            for (Text text : values) {
                final String[] splited = text.toString().split(h2);
                final Put put = new Put(Bytes.toBytes(splited[0]));
                for (Iterator i = root.element("hbase").elementIterator(); i.hasNext();) {
                    Element element = (Element) i.next();
                    String name = null;
                    String neirong = null;
                    int a = 0;
                    InetAddress addressIPv6 = null;
                    Inet6Address IPv6 = null;
                    Inet4Address IPv4 = null;
                    if (element.getQualifiedName().equals("ziduan")) {
                        name = element.attributeValue("name");
                        neirong = element.getText();
                        a = Integer.parseInt(neirong);
                        try {
                            // getByName returns Inet4Address for a dotted-decimal literal
                            // and Inet6Address for an IPv6 literal such as "::".
                            addressIPv6 = InetAddress.getByName(splited[a + 1]);
                        } catch (UnknownHostException e1) {
                            e1.printStackTrace();
                        }
                        if (name.equals("ipv4")) {
                            if (addressIPv6 instanceof Inet4Address) {
                                IPv4 = (Inet4Address) addressIPv6;
                                // System.out.println("addressIPv4 =" + addressIPv6.getHostAddress());
                                put.add(Bytes.toBytes("cf"), name.getBytes(),
                                        Bytes.toBytes(addressIPv6.getHostAddress()));
                                context.write(NullWritable.get(), put);
                            }
                        } else if (name.equals("ipv6")) {
                            if (addressIPv6 instanceof Inet6Address) {
                                IPv6 = (Inet6Address) addressIPv6;
                                // System.out.println("addressIPv6 =" + addressIPv6.getHostAddress());
                                put.add(Bytes.toBytes("cf"), name.getBytes(),
                                        Bytes.toBytes(addressIPv6.getHostAddress()));
                                context.write(NullWritable.get(), put);
                            } else if (splited[a + 1].equals("h107")) {
                                // Lines from host h107 are dropped entirely.
                                break;
                            }
                        } else {
                            put.add(Bytes.toBytes("cf"), name.getBytes(), Bytes.toBytes(splited[a + 1]));
                            context.write(NullWritable.get(), put);
                        }
                    }
                }
            }
        }
    }
}

[hadoop@h71 hui]$ vi dao.xml
[hadoop@h72 hui]$ vi dao.xml
[hadoop@h73 hui]$ vi dao.xml
<?xml version="1.0" encoding="UTF-8"?>
<peizhi>
    <hbase>
        <jianbao name="messages"></jianbao>
        <liezu name="cf"></liezu>
        <ziduan name="ipv4">3</ziduan>
        <ziduan name="ipv6">3</ziduan>
        <ziduan name="host">4</ziduan>
        <ziduan name="leixing">5</ziduan>
        <fengefu fuhao=" "></fengefu>
    </hbase>
</peizhi>

[hadoop@h71 ~]$ hadoop fs -cat /messages
Jan 23 19:59:00 192.168.101.254 s_sys@hui trafficlogger: empty map for 1:4097 in classnames
Feb 20 06:25:04 h107 rsyslogd: [origin software="rsyslogd" swVersion="8.4.2" x-pid="22204" x-info="http://www.rsyslog.com"] rsyslogd was HUPed
Jan 24 19:59:01 :: s_sys@hui trafficlogger: empty map for 1:4097 in classnames
Jan 23 19:59:02 192.168.101.254 s_sys@hui trafficlogger: empty map for 1:4097 in classnames
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac messages33.java
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar messages33*class
[hadoop@h71 q1]$ hadoop jar xx.jar messages33
hbase(main):108:0> scan 'messages'
ROW                COLUMN+CELL
 20170123195900    column=cf:host, timestamp=1489827720926, value=s_sys@hui
 20170123195900    column=cf:ipv4, timestamp=1489827720926, value=192.168.101.254
 20170123195900    column=cf:leixing, timestamp=1489827720926, value=trafficlogger:
 20170123195902    column=cf:host, timestamp=1489827720926, value=s_sys@hui
 20170123195902    column=cf:ipv4, timestamp=1489827720926, value=192.168.101.254
 20170123195902    column=cf:leixing, timestamp=1489827720926, value=trafficlogger:
 20170124195901    column=cf:host, timestamp=1489827720926, value=s_sys@hui
 20170124195901    column=cf:ipv6, timestamp=1489827720926, value=0:0:0:0:0:0:0:0
 20170124195901    column=cf:leixing, timestamp=1489827720926, value=trafficlogger:
3 row(s) in 0.0190 seconds
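To spot-check a single row from Java rather than the shell, a minimal read-back sketch using the HBase 1.0 client API that ships with this CDH version (ReadBack is a hypothetical name):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class ReadBack {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "192.168.8.71");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("messages"))) {
            // Fetch the IPv6 row written above and print cf:ipv6.
            Result r = table.get(new Get(Bytes.toBytes("20170124195901")));
            System.out.println(Bytes.toString(
                    r.getValue(Bytes.toBytes("cf"), Bytes.toBytes("ipv6")))); // 0:0:0:0:0:0:0:0
        }
    }
}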