Importing Hadoop 2.6 data into Elasticsearch 2.2 (parsing HBase-exported data)


Reference:

https://www.elastic.co/guide/en/elasticsearch/hadoop/current/mapreduce.html


1. Download the dependency jar

Download elasticsearch-hadoop-2.2.0.jar from your internal Maven repository (or from Maven Central).
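If no internal repository is available, the jar can usually be pulled from Maven Central; the coordinates below are the customary ones for elasticsearch-hadoop 2.2.0 (verify the group/artifact for your setup):

mvn dependency:get -Dartifact=org.elasticsearch:elasticsearch-hadoop:2.2.0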

2. Data flow

HBase exported data -> HDFS -> Elasticsearch 2
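For reference, the SequenceFiles on HDFS are assumed here to come from HBase's bundled Export MapReduce job (it writes ImmutableBytesWritable/Result pairs, which is exactly what the mapper below reads); table name and output path are placeholders:

hbase org.apache.hadoop.hbase.mapreduce.Export <table_name> <hdfs_output_dir>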

3. The code is pasted directly below.

import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.mapreduce.MutationSerialization;
import org.apache.hadoop.hbase.mapreduce.ResultSerialization;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.elasticsearch.hadoop.mr.EsOutputFormat;
import org.elasticsearch.hadoop.mr.LinkedMapWritable;

public class MyJob extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Path input = new Path(args[0]);
        Configuration conf = getConf();

        // Speculative execution must be disabled when writing to Elasticsearch.
        conf.setBoolean("mapred.map.tasks.speculative.execution", false);
        conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

        // Register HBase serializations so the SequenceFile values (Result) can be deserialized.
        // Not needed for data exported by HBase 0.9x; required for HBase 1.x and above.
        conf.setStrings("io.serializations", conf.get("io.serializations"),
                MutationSerialization.class.getName(), ResultSerialization.class.getName());

        // conf.set("es.nodes", "host228");              // Elasticsearch node(s); passed via -D es.nodes here
        conf.set("es.port", "9200");                      // Elasticsearch HTTP port
        // conf.set("es.resource", "ehlindex/tr_plate");  // index/type used for storing data; passed via -D es.resource here

        // Apply any remaining generic options (-D ...) to conf, then dump the effective configuration.
        GenericOptionsParser parser = new GenericOptionsParser(conf, args);
        for (Entry<String, String> entry : conf) {
            System.out.printf("%s=%s\n", entry.getKey(), entry.getValue());
        }

        Job job = Job.getInstance(conf, "hfile 2 es");
        job.setJarByClass(MyJob.class);
        FileInputFormat.addInputPath(job, input);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(EsOutputFormat.class);
        job.setMapOutputValueClass(LinkedMapWritable.class);
        job.setNumReduceTasks(0);
        job.setMapperClass(MyMaper.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int run = ToolRunner.run(new MyJob(), args);
        System.exit(run);
    }
}
import java.io.IOException;

import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.elasticsearch.hadoop.mr.LinkedMapWritable;

import com.ehl.im.transfer.TRFieldEnum;
import com.ehl.im.transfer.TravelRecord;
import com.ehl.im.transfer.CarPlateCommonUtil; // import was missing in the original; package assumed, adjust to your project

public class MyMaper extends Mapper<ImmutableBytesWritable, Result, NullWritable, LinkedMapWritable> {

    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context)
            throws IOException, InterruptedException {
        // Dry-run switch: with -D notinsert=true nothing is written to Elasticsearch.
        if ("true".equals(context.getConfiguration().get("notinsert"))) {
            return;
        }
        try {
            LinkedMapWritable linkObj = result2Map(value);
            context.write(NullWritable.get(), linkObj);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Convert one HBase Result into the field map that EsOutputFormat indexes as a document.
    private LinkedMapWritable result2Map(Result r) {
        LinkedMapWritable linkObj = new LinkedMapWritable();
        byte[] passCarRowValue = r.getValue("cf".getBytes(), null);
        TravelRecord record = new TravelRecord(passCarRowValue);

        linkObj.put(new Text("timestamp"),
                new LongWritable(Long.valueOf(record.getStringValue(TRFieldEnum.TIMESTAMP))));
        linkObj.put(new Text("car_plate_number"),
                new Text(record.getStringValue(TRFieldEnum.CAR_PLATE_NUMBER)));
        try {
            linkObj.put(new Text("carplateindex"),
                    new Text(CarPlateCommonUtil.produceCarPlateIndexStr(
                            record.getStringValue(TRFieldEnum.CAR_PLATE_NUMBER))));
        } catch (Exception e) {
            e.printStackTrace();
        }
        linkObj.put(new Text("speed"),
                new LongWritable(Long.valueOf(record.getStringValue(TRFieldEnum.SPEED))));
        linkObj.put(new Text("lane_id"), new Text(record.getStringValue(TRFieldEnum.LANE_ID)));
        linkObj.put(new Text("camera_location"), new Text(record.getStringValue(TRFieldEnum.CAMERA_LOCATION)));
        linkObj.put(new Text("bay_id"), new Text(record.getStringValue(TRFieldEnum.BAY_ID)));
        linkObj.put(new Text("camera_orientation"), new Text(record.getStringValue(TRFieldEnum.CAMERA_ORIENTATION)));
        linkObj.put(new Text("car_brand"), new Text(record.getStringValue(TRFieldEnum.CAR_BRAND)));
        linkObj.put(new Text("car_color"), new Text(record.getStringValue(TRFieldEnum.CAR_COLOR)));
        linkObj.put(new Text("car_plate_color"), new Text(record.getStringValue(TRFieldEnum.CAR_PLATE_COLOR)));
        linkObj.put(new Text("car_plate_type"), new Text(record.getStringValue(TRFieldEnum.CAR_PLATE_TYPE)));
        linkObj.put(new Text("car_status"), new Text(record.getStringValue(TRFieldEnum.CAR_STATUS)));
        linkObj.put(new Text("travel_orientation"), new Text(record.getStringValue(TRFieldEnum.TRAVEL_ORIENTATION)));
        linkObj.put(new Text("plate_coordinates"), new Text(record.getStringValue(TRFieldEnum.PLATE_COORDINATES)));
        linkObj.put(new Text("driver_coordinates"), new Text(record.getStringValue(TRFieldEnum.DRIVER_COORDINATES)));

        // Up to three image URLs are stored as tp1/tp2/tp3, skipping empty entries.
        String[] imgUrls = record.getStringArrayValue(TRFieldEnum.IMAGE_URLS);
        if (imgUrls != null) {
            if (imgUrls.length >= 1 && imgUrls[0] != null && !"".equals(imgUrls[0])) {
                linkObj.put(new Text("tp1"), new Text(imgUrls[0]));
            }
            if (imgUrls.length >= 2 && imgUrls[1] != null && !"".equals(imgUrls[1])) {
                linkObj.put(new Text("tp2"), new Text(imgUrls[1]));
            }
            if (imgUrls.length >= 3 && imgUrls[2] != null && !"".equals(imgUrls[2])) {
                linkObj.put(new Text("tp3"), new Text(imgUrls[2]));
            }
        }
        return linkObj;
    }
}
The following creates the Elasticsearch index and mapping:
curl -XPOST host213:9200/ehlindex -d '{
  "settings" : {
    "number_of_shards" : 20,
    "number_of_replicas" : 0
  },
  "mappings" : {
    "tr_plate" : {
      "properties" : {
        "timestamp"           : { "type" : "long",   "index" : "not_analyzed" },
        "car_plate_number"    : { "type" : "string", "index" : "not_analyzed" },
        "speed"               : { "type" : "long",   "index" : "not_analyzed" },
        "lane_id"             : { "type" : "string", "index" : "not_analyzed" },
        "camera_location"     : { "type" : "string", "index" : "not_analyzed" },
        "bay_id"              : { "type" : "string", "index" : "not_analyzed" },
        "camera_orientation"  : { "type" : "string", "index" : "not_analyzed" },
        "car_brand"           : { "type" : "string", "index" : "not_analyzed" },
        "car_color"           : { "type" : "string", "index" : "not_analyzed" },
        "car_plate_color"     : { "type" : "string", "index" : "not_analyzed" },
        "car_plate_type"      : { "type" : "string", "index" : "not_analyzed" },
        "tp1"                 : { "type" : "string", "index" : "not_analyzed" },
        "tp2"                 : { "type" : "string", "index" : "not_analyzed" },
        "tp3"                 : { "type" : "string", "index" : "not_analyzed" },
        "car_status"          : { "type" : "string", "index" : "not_analyzed" },
        "travel_orientation"  : { "type" : "string", "index" : "not_analyzed" },
        "plate_coordinates"   : { "type" : "string", "index" : "not_analyzed" },
        "driver_coordinates"  : { "type" : "string", "index" : "not_analyzed" },
        "carplateindex"       : { "type" : "string", "index" : "analyzed" }
      }
    }
  }
}'
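To confirm the index and mapping were created as expected before loading data, the mapping can be read back (host and index names are the ones used above):

curl -XGET 'host213:9200/ehlindex/_mapping?pretty'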


Running the MapReduce job requires the following parameters; the command is:

hadoop jar downloads/Hfile2Es-0.0.1-SNAPSHOT-jar-with-dependencies.jar -D  es.resource=ehlindex/tr_plate -D es.nodes=host228   /yangxTest/qhd_data1/qhd_data1

All of these parameters (es.resource, es.nodes, and the input path) can be read inside MyJob via GenericOptionsParser, which keeps the job flexible. This is pure hands-on material; work through it at your own pace.
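Once the job finishes, a quick sanity check is to count the documents that landed in the index (using the node and index/type from the run command above):

curl -XGET 'host228:9200/ehlindex/tr_plate/_count?pretty'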




