Hadoop mmseg4j Chinese Word Segmentation Example


pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.nxcjh.iktest</groupId>
  <artifactId>iktest</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>iktest</name>
  <url>http://maven.apache.org</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.4.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
      <version>2.4.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.4.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/commons-cli/commons-cli -->
    <dependency>
      <groupId>commons-cli</groupId>
      <artifactId>commons-cli</artifactId>
      <version>1.3.1</version>
    </dependency>
    <dependency>
      <groupId>com.nxcjh.ikanalyzer</groupId>
      <artifactId>ikanalyzer</artifactId>
      <version>1.1.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.chenlb.mmseg4j/mmseg4j-core -->
    <dependency>
      <groupId>com.chenlb.mmseg4j</groupId>
      <artifactId>mmseg4j-core</artifactId>
      <version>1.10.0</version>
    </dependency>
  </dependencies>
  <!-- Package a single jar that bundles all dependencies -->
  <build>
    <plugins>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <archive>
            <manifest>
              <!-- Must match the actual package of IKJob (com.nxcjh.mmseg below) -->
              <mainClass>com.nxcjh.mmseg.IKJob</mainClass>
            </manifest>
          </archive>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
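Because the assembly plugin is bound to the package phase, a plain mvn package is enough to build the runnable artifact; by Maven's default naming it lands at target/iktest-0.0.1-SNAPSHOT-jar-with-dependencies.jar, with mmseg4j and the other dependencies bundled in, so it can be submitted to the cluster directly.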

Mapper

package com.nxcjh.mmseg;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import com.chenlb.mmseg4j.ComplexSeg;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MMSeg;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.Word;

public class MmsegMapper extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    // Directory holding the mmseg4j dictionary files (local filesystem path).
    File file = new File("/home/jianglu/下载/mmseg4j/data");
    // Obtain the dictionary via the factory method; unlike older versions
    // of mmseg4j, Dictionary can no longer be constructed with new.
    Dictionary dic = Dictionary.getInstance(file);
    Seg seg = new ComplexSeg(dic);

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Segment the input line and emit (token, 1) for every word found.
        MMSeg mmSeg = new MMSeg(new StringReader(value.toString()), seg);
        Word iks = null;
        while ((iks = mmSeg.next()) != null) {
            word.set(iks.getString());
            context.write(word, one);
        }
    }
}
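Before wiring the tokenizer into MapReduce, the same mmseg4j calls can be sanity-checked standalone. The following is a minimal sketch of mine, not from the original post: the class name SegDemo and the dictionary path are placeholders, and it reuses exactly the Dictionary/ComplexSeg/MMSeg API shown in the mapper, printing one token per line.

package com.nxcjh.mmseg;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;

import com.chenlb.mmseg4j.ComplexSeg;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MMSeg;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.Word;

public class SegDemo {
    public static void main(String[] args) throws IOException {
        // Placeholder path: point this at your own mmseg4j dictionary directory.
        Dictionary dic = Dictionary.getInstance(new File("/path/to/mmseg4j/data"));
        Seg seg = new ComplexSeg(dic);
        // Same tokenization loop as in MmsegMapper, without the Hadoop context.
        MMSeg mmSeg = new MMSeg(new StringReader("研究生命起源"), seg);
        Word w;
        while ((w = mmSeg.next()) != null) {
            System.out.println(w.getString());
        }
    }
}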

Reducer

package com.nxcjh.mmseg;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class IKReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum all counts emitted for this token.
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
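Because the reduce function only sums integers, which is associative and commutative, the job below safely reuses this same class as the combiner, pre-aggregating counts on the map side and cutting shuffle traffic.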

Job

package com.nxcjh.mmseg;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class IKJob {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        // Job.getInstance replaces the Job constructor, which is deprecated in Hadoop 2.x.
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(IKJob.class);
        job.setMapperClass(MmsegMapper.class);
        job.setCombinerClass(IKReducer.class);
        job.setReducerClass(IKReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
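After mvn package, the job can be submitted with something like hadoop jar target/iktest-0.0.1-SNAPSHOT-jar-with-dependencies.jar <in> <out>, where <in> and <out> are HDFS paths; hadoop jar picks up the main class from the manifest configured in the pom. One caveat: the mapper loads its dictionary from a local filesystem path, so that directory must exist at the same location on every node that runs map tasks.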

