High-Frequency Words in 3000 POJ English Problems -- A Hadoop Word Count


I've been learning Hadoop recently and writing MapReduce jobs, starting with the simplest one: word count.

Back when I was grinding through problems, my English was poor and it was a real pain.

So now I've counted the words that appear in POJ problem statements.

First, I crawled the English statements of problems 1001--4000 and cleaned the data, stripping out the numbers, Chinese text, and assorted odd symbols that I didn't need.
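To make the cleanup concrete, here is a minimal standalone sketch that uses the same regex chain as the crawler further down; the sample input string is made up for illustration.

public class CleanDemo {
    public static void main(String[] args) {
        // Made-up fragment of a problem statement.
        String raw = "The first line contains an integer N (1 <= N <= 100).";
        String cleaned = raw
                .replaceAll("\\.", " ")         // periods become spaces
                .replaceAll("<.+>", " ")        // strip HTML tags
                .replaceAll(" +", " ")          // collapse runs of spaces
                .replaceAll("[^ a-zA-Z]", "");  // keep only letters and spaces
        System.out.println(cleaned);            // digits and symbols are gone, words remain
    }
}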

Then I ran the MapReduce jobs directly (the word count first, then the sort; the code is further down).
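For completeness, this is roughly how the two jobs were run; the HDFS paths and the jar name below are placeholders, not the originals.

hdfs dfs -put poj11.txt /poj/input
hadoop jar wc.jar cn.ky.mapreduce.sortwc.Count /poj/input   /poj/counted
hadoop jar wc.jar cn.ky.mapreduce.sortwc.Sort  /poj/counted /poj/sorted
hdfs dfs -cat /poj/sorted/part-r-00000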

It turns out these 3000 problems contain fewer than 800 distinct words. Here is the tail of the sorted output first (the most frequent words):

each    19
are     21
by      21
input   23
that    24
line    26
be      33
will    33
The     41
number  42
and     58
is      60
in      61
to      63
a       65
of      129
the     226


The results still seem worth using as a reference.

The crawler (I'm not very good at this, so it's pretty crude)

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class Pc {

    // Download a page and return its HTML as one string (null on failure).
    public static String getHtml(String urlString) {
        try {
            StringBuffer html = new StringBuffer();
            URL url = new URL(urlString);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            InputStreamReader isr = new InputStreamReader(conn.getInputStream());
            BufferedReader br = new BufferedReader(isr);
            String temp;
            while ((temp = br.readLine()) != null) {
                html.append(temp).append("\n");
            }
            br.close();
            isr.close();
            return html.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    // Append a chunk of cleaned text to the output file.
    public static void xiewenjian(String str) throws Exception {
        byte[] b = str.getBytes();
        try (FileOutputStream out = new FileOutputStream("d:/poj11.txt", true)) {
            out.write(b);
        }
    }

    // Extract every <div class="ptx" lang="en-US"> block starting at offset k,
    // clean it, and append it to the file. Note: each recursive call downloads
    // the page again; fetching once and looping would be cheaper.
    public void zhuaqu(String url, int k) throws Exception {
        String s = getHtml(url);
        if (s == null) {
            return; // download failed (e.g. the problem id does not exist)
        }
        String head = "<div class=\"ptx\" lang=\"en-US\">";
        int h1 = s.indexOf(head, k);
        if (h1 == -1) {
            return;
        }
        int x = h1 + head.length();
        String tail = "</div>";
        int y = s.indexOf(tail, x);
        String str = s.substring(x, y);
        String str1 = str.replaceAll("\\.", " ");         // periods become spaces
        String str2 = str1.replaceAll("<.+>", " ");       // strip HTML tags
        String str3 = str2.replaceAll(" +", " ");         // collapse spaces
        String str4 = str3.replaceAll("[^ a-zA-Z]", "");  // keep only letters and spaces
        xiewenjian(" " + str4);
        zhuaqu(url, y); // continue after this block
    }

    public static void main(String[] args) throws Exception {
        Pc p = new Pc();
        String url0 = "http://poj.org/problem?id=";
        for (int i = 1001; i <= 4000; i++) {
            String url = url0 + Integer.toString(i);
            System.out.println(url);
            p.zhuaqu(url, 0);
        }
    }
}


The word count

package cn.ky.mapreduce.sortwc;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Count {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Count.class);
        job.setMapperClass(SMap.class);
        job.setReducerClass(SReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }

    // Mapper: emit (word, 1) for every non-empty token on the line.
    public static class SMap extends Mapper<LongWritable, Text, Text, IntWritable> {
        Text k = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] words = line.split(" ");
            for (String word : words) {
                if (!word.isEmpty()) {
                    k.set(word);
                    context.write(k, new IntWritable(1));
                }
            }
        }
    }

    // Reducer: sum the 1s for each word.
    public static class SReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable bean : values) {
                sum += bean.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
}
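One small tweak worth considering (not in the original job): because SReduce just sums IntWritable values, it can also be registered as a combiner, so partial sums happen on the map side and less data crosses the network during the shuffle.

// Hedged suggestion, to be added alongside the other job.set* calls in Count.main():
job.setCombinerClass(SReduce.class);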


The sort
package cn.ky.mapreduce.sortwc;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Sort {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Sort.class);
        job.setMapperClass(SMap.class);
        job.setReducerClass(SReduce.class);
        job.setMapOutputKeyClass(Infbean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Infbean.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }

    // Mapper: parse the "word <tab> count" lines produced by the Count job and
    // wrap them in an Infbean key, so the shuffle sorts records by Infbean.compareTo.
    public static class SMap extends Mapper<LongWritable, Text, Infbean, NullWritable> {
        Infbean v = new Infbean();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] str = line.split("\t");
            v.set(str[0], Integer.parseInt(str[1]));
            context.write(v, NullWritable.get());
        }
    }

    // Reducer: records arrive already sorted by key, so just write them out.
    public static class SReduce extends Reducer<Infbean, NullWritable, Infbean, NullWritable> {
        @Override
        protected void reduce(Infbean key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }
}
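One caveat about the sort job: its output is only globally ordered because everything ends up in a single reduce task. That is already the MapReduce default, but it can be pinned down explicitly (a suggestion, not part of the original code):

// Hedged addition for Sort.main(): force a single reducer so part-r-00000
// contains one globally sorted list.
job.setNumReduceTasks(1);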

The custom type

package cn.ky.mapreduce.sortwc;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class Infbean implements WritableComparable<Infbean> {
    private String word;
    private int count;

    public void set(String word, int count) {
        this.word = word;
        this.count = count;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Deserialize in the same order as write().
        this.word = in.readUTF();
        this.count = in.readInt();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(word);
        out.writeInt(count);
    }

    /**
     * Sort ascending by count -- roughly the same idea as a bubble-sort comparison.
     * Never returns 0, so beans with equal counts are kept as separate keys.
     */
    @Override
    public int compareTo(Infbean o) {
        if (this.count > o.count) {
            return 1;
        } else {
            return -1;
        }
    }

    @Override
    public String toString() {
        return this.word + "\t" + this.count;
    }

    public String getWord() {
        return word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public int getCount() {
        return count;
    }

    public void setCount(int count) {
        this.count = count;
    }
}
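As written, compareTo never returns 0, so words with equal counts end up in an arbitrary relative order. If you would rather break ties alphabetically, the method could be rewritten as below (a variant, not the original behavior):

// Variant of compareTo with an alphabetical tie-break; not what the original does.
@Override
public int compareTo(Infbean o) {
    if (this.count != o.count) {
        return Integer.compare(this.count, o.count); // ascending by count
    }
    return this.word.compareTo(o.word);              // equal counts: alphabetical
}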


Finally, the full statistics (raw output, with each word immediately followed by its count):
zip1zotz1writing1zac1yoxkin1AACEDGG1AT1An1yax1xul1x1cumhu1court1work1world1currently1wont1curve1Arrange1Another1counting1At1whose1who1whitespace1whilescanfsdsnEOF1whilescanfsdsn1while1counted1Case1when1wheather1whats1B1Businesses1water1Cartesian1wants1could1unambiguous1Consider1DAABEC1using1DNA1uppercase1upper1up1until1unsortedness1unsorted1unlucky1uayet1tzec1Dit1type1During1twentyfive1Exactly1F1trailing1trade1total1tonight1Figure1correspond1Have1From1tied1Further1through1Generate1thought1though1Ginos1thirteen1third1H1Hes1coordinates1coordinate1convert1task1However1Hut1Hyphens1th1I1tens1IONU1taxing1control1supplies1systems1sweep1surrounding1surprising1suppressed1constraints1starting1Imaging1Inc1stores1stopped1stock1Insignificant1step1statements1state1starts1debt1square1Instead1It1J1decided1sold1specifies1spaces1K1sortedwhile1sortedness1L1somehow1some1solve1solution1defined1denominations1signals1singlevalue1Leading1since1simple1concentration1encoding1outside1rather1check1occurrence1making1computer1session1automatically1run1cataloguing1boundary1minimum1civilization1request1look1likely1legal1Problems1justice1Some1back1sharper1believe1University1get1him1much1floating1Month1extra1mean1equal1seems1easier1removed1composed1records1alternating1Ruritania1lately1lot1knotted1sample1assume1punctuation1once1sabbatical1based1differences1occupy1Two1blank1alphabetical1holly1W1national1having1OUTPUT1fourth1rest1Where1Years1financial1match1few1Philately1excluding1phrase1actual1persons1endofline1equally1magnitude1performs1duplicate1made1divided1live1did1after1description1real1Rn1done1arranged1read1overhang1encounted1known1quality1kankin1local1job1purchasing1attempts1service1involving1Note1into1constraint1instance1THE1individual1discovered1including1NumberOfTheDay1immediately1problems1illustrated1problem1printed1second1hotel1O1brute1precision1her1necessarily1graduated1call1possible1capitalization1result1g1When1forms1separate1caused1eroding1floor1plural1finds1about1filled1achieve1field1P1fact1ends1experience1measured1examples1sell1every1phone1equivalent1requests1enough1added1comments1research1common1either1edge1edgedetected1compiling1mac1company1encodes1does1locations1discovery1Property1computed1allocates1dialed1Locations1detected1dont1lexicographical1remaining1reads1scanf1area1per1left1she1Ruritanian1shrinking1ascending1save1asks1exists1koyab1outputs1S1displayed1keypad1Satellite1purposes1religious1judged1orginal1Semicircle1NameOfTheDay1items1Service1issued1do1available1Sometimes1investigating1compute1inversion1Successive1Notice1Mapper1balances1Number1inside1River1information1That1processes1algorithm1indicated1Thus1includes1row1beexactly1To1begin1dial1believed1U1bisects1detection1born1rounded1hyphen1pax1V1shown1hoping1particular1hold1never1hired1past1highest1preceded1build1Mississippi1calculate1nearly1nd1nearest1collectors1grab1endoffile1calling1Waterloo1name1chen1generally1We1card1portfolio1cards1count1catalog1according1force1muan1responsible1Postal1ceh1YEAR1floatingpoint1column1centered1ZWQM1financing1please1finally1mol1files1except1figure1above1fewest1actually1fail1respect1absolute1entries1expressed1Pizza1doing1allocations1design1several1reverse1postage1group1greater2going2give2pop2point2follows2following2seven2respectively2pizza2message2mental2eznab2mem2erosion2mapping2sets2person2manik2make2period2edges2eb2duplicates2see2lost2due2dollar2penny2dialing2life2described2letter2lengths2pair2learned2large2lamat2know2other2kan2ix2inversions2old2intellectual2ok2series2indicating2indicates2included2nonnegative2sequen
ces2none2hyphens2denomination2right2house2new2necessary2had2sign2smaller2list2software2muluk2spell2start2such2take2their2them2then2times2very2write2where2width2within2word2would2zeros2One2OF2between2ben2corresponding2being2RLE2Output2After2Q2both2As2T2TUTGLOP2G2consisting2Louisiana2businesses2They2considering2GINO2Your2Use2Ya2computation2compressed2Year2caban2bank2columns2Input2again2END2axis2average2allocation2alone2cib2C2D2cimi2characters2ahau2also2chuen2another2circle2chicchan2appear2cases2Dont2appears2Help2canac2E3giving3closing3twelve3exact3even3emotional3c3way3but3determine3time3consist3decimal3how3ik3contains3d3your3cycle3process3property3physical3below3peaks3may3No3akbal3been3out3all3string3integers3Since3these3balance3account3measure3M3There3issue3These3occur3professor3single3last3space3long3m3names3must3Y3answer3miles3still3month3current3used4like4want4test4least4called4they4Fred4exactly4imix4combination4length4many4i4its4people4periods4money4All4part4program4were4p4pixel4needs4customer4only4Larry4Maya4containing4X4consists4pixels4denoted4stamp5any5most5e5directory5letters5file5end5tie5N5semicircle5customers5land5best5occurs5he5can5RPS5R5four6standard6format6If6calendar6contain6there6than6beginning6Z6strings6memorable6images6set6In6digits6dates6months6sorted6Tzolkin6two6if7pairs7it7data7This7positive7You7followed7maximum7A7Haab7values7which7day8triple8cycles8sequence8Each8on8more8types8value8no8map8this8date9different9order9next9three9case9integer9was9lines10example10same10form10has10one10numbers11should11not11an11peak11n11stamps11print11have11his12first12with13image13you13or13given13from14For14telephone14year14days15for15at16output16as18each19are21by21input23that24line26be33will33The41number42and58is60in61to63a65of129the226