hadoop MMSEG4J 分词实例 (Hadoop MMSEG4J word-segmentation example)
来源:互联网 发布:mac默认系统 编辑:程序博客网 时间:2024/06/11 02:02
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.nxcjh.iktest</groupId>
  <artifactId>iktest</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>iktest</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.4.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
      <version>2.4.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.4.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/commons-cli/commons-cli -->
    <dependency>
      <groupId>commons-cli</groupId>
      <artifactId>commons-cli</artifactId>
      <version>1.3.1</version>
    </dependency>
    <dependency>
      <groupId>com.nxcjh.ikanalyzer</groupId>
      <artifactId>ikanalyzer</artifactId>
      <version>1.1.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.chenlb.mmseg4j/mmseg4j-core -->
    <dependency>
      <groupId>com.chenlb.mmseg4j</groupId>
      <artifactId>mmseg4j-core</artifactId>
      <version>1.10.0</version>
    </dependency>
  </dependencies>

  <!-- Build a fat jar (jar-with-dependencies) so the job can be submitted
       with all third-party libraries bundled. -->
  <build>
    <plugins>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <archive>
            <manifest>
              <!-- FIX: the driver class lives in package com.nxcjh.mmseg,
                   not com.nxcjh.iktest — the old value produced a jar whose
                   Main-Class could not be found at launch. -->
              <mainClass>com.nxcjh.mmseg.IKJob</mainClass>
            </manifest>
          </archive>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
Mapper
package com.nxcjh.mmseg;import java.io.File;import java.io.IOException;import java.io.StringReader;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Mapper;import com.chenlb.mmseg4j.ComplexSeg;import com.chenlb.mmseg4j.Dictionary;import com.chenlb.mmseg4j.MMSeg;import com.chenlb.mmseg4j.Seg;import com.chenlb.mmseg4j.Word;public class MmsegMapper extends Mapper<Object, Text, Text, IntWritable> {private final static IntWritable one = new IntWritable(1);private Text word = new Text();File file = new File("/home/jianglu/下载/mmseg4j/data");//词典的目录Dictionary dic = Dictionary.getInstance(file);//建立词典实例,与比较老的版本中不相同。不能直接new。Seg seg = new ComplexSeg(dic); public void map(Object key, Text value, Context context) throws IOException, InterruptedException{MMSeg mmSeg = new MMSeg(new StringReader(value.toString()), seg);Word iks = null;while((iks = mmSeg.next())!=null) {word.set(iks.getString());context.write(word, one);}}}
Reducer
package com.nxcjh.mmseg;import java.io.IOException;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Reducer;public class IKReducer extendsReducer<Text, IntWritable, Text, IntWritable> {private IntWritable result = new IntWritable();public void reduce(Text key, Iterable<IntWritable> values, Context context)throws IOException, InterruptedException {int sum = 0;for (IntWritable val : values) {sum += val.get();}result.set(sum);context.write(key, result);}}
Job
package com.nxcjh.mmseg;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.GenericOptionsParser;public class IKJob {public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: wordcount <in> <out>"); System.exit(2); } Job job = new Job(conf, "word count"); job.setJarByClass(IKJob.class); job.setMapperClass(MmsegMapper.class); job.setCombinerClass(IKReducer.class); job.setReducerClass(IKReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }}
0 0
- hadoop MMSEG4 分词实例
- Hadoop入门(2):分词
- Hadoop中文分词
- java中文分词实例
- ansj_seg中文分词实例
- scws 分词简单实例
- mysql分词搜索实例
- Hadoop IK分词 词频统计
- hadoop实例
- lucene3.1.0 简单分词实例
- 基于hadoop的分布式分词程序(庖丁分词)
- 基于hadoop的分词程序(ICTCLAS分词器)
- hadoop中文分词、词频统计及排序
- hadoop 单机 搭建 ,并简单分词
- hadoop学习【8】——基于hadoop的分词程序二(ICTCLAS分词器)
- Lucene 实例 IKAnalyzer中文分词器
- IKanalyzer分词实例并统计词频
- 搜索引擎之猎兔分词实例
- spark 2.0 OneForOneStreamManager
- 删除排序链表的重复节点
- Mysql 如何备份与还原数据库(在Mysql Workbench)
- java util
- linux 下shell中if的“-e,-d,-f”是什么意思
- hadoop MMSEG4 分词实例
- 如何配置https站点
- 使用ab进行页面的压力测试
- GDB学习整理2--gdb常用命令
- throw er; // Unhandled 'error' event
- HDU 5384 Danganronpa AC自动机
- 01-RationalRose的安装
- [问题解决] eclipse.ini文件配置启动JDK
- 深入分析Linux kernel exception框架