Counting duplicate IPs (word count): a comparison of implementations across languages

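The task: given a file in which each line starts with an IP address, count how many times each IP appears. The same job is implemented below in shell (awk), Python, Java, Hadoop MapReduce, and Spark.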

Shell

ip.txt

192.168.0.1 zhangxc
192.168.0.1 zhangxc1
192.168.0.1 zhangxc3
192.168.0.2 zhangc
192.168.0.2 zhangc1
192.168.0.3 zhangc
192.168.0.3 zhangc
192.168.0.3 zhangxc
192.168.0.3 zhangxc
192.168.0.0 zhang
192.168.0.5 zhang
192.168.0.0 zhang
192.168.0.0 zhang
192.168.0.0 zhang2

awk '{a[$1]++} END{for(i in a)print i,a[i]}' ip.txt
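The one-liner builds an associative array a keyed on the first field (the IP), incrementing the count for every line; the END block then prints each IP with its total. On the ip.txt above it produces output along these lines (awk's for (i in a) iterates in no guaranteed order):

192.168.0.0 4
192.168.0.1 3
192.168.0.2 2
192.168.0.3 4
192.168.0.5 1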

Python

a.log 

#111.172.249.84 - - [12/Dec/2011:05:33:36 +0800] "GET /images/i/goTop.png HTTP/1.0" 200 486 "http://wh.xxxx.com/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)"
#111.172.249.84 - - [12/Dec/2011:05:33:36 +0800] "GET /images/i/goTop.png HTTP/1.0" 200 486 "http://wh.xxxx.com/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)"
#111.172.249.85 - - [12/Dec/2011:05:33:36 +0800] "GET /images/i/goTop.png HTTP/1.0" 200 486 "http://wh.xxxx.com/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)"
#111.172.249.86 - - [12/Dec/2011:05:33:36 +0800] "GET /images/i/goTop.png HTTP/1.0" 200 486 "http://wh.xxxx.com/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)"
import re

# Match a leading '#' followed by a dotted-quad IPv4 address
ip_pattern = re.compile(
    r'^#(((2[0-4]\d|25[0-5]|[01]?\d\d?)\.){3}(2[0-4]\d|25[0-5]|[01]?\d\d?))')

counts = {}
with open("./a.log") as f:
    for line in f:
        match = ip_pattern.match(line)
        if match:
            ip = match.group(1)
            counts[ip] = counts.get(ip, 0) + 1

for ip, count in counts.items():
    print(ip, count)
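The regex matches a leading '#' followed by an IPv4 dotted quad and counts each match. Run against the sample a.log above, the script should print one pair per line:

111.172.249.84 2
111.172.249.85 1
111.172.249.86 1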


Java

package p;

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.HashMap;
import java.util.Map;

/**
 * Read the file, store the data, count occurrences, print the result.
 */
public class T {

    // ip -> visit count
    static Map<String, Integer> map = new HashMap<String, Integer>();

    public static void main(String[] args) throws Exception {
        FileReader fr = new FileReader("F:/ip.txt");
        BufferedReader br = new BufferedReader(fr);
        String str = null;
        while ((str = br.readLine()) != null) {
            String[] split = str.split(" ");
            String key = split[0];
            Integer value = map.get(key);
            if (value == null) {
                map.put(key, 1);
            } else {
                value++;
                map.put(key, value);
            }
        }
        System.out.println(map);
        br.close();
        fr.close();
    }
}
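On Java 8 and later, the get/put sequence can be collapsed into a single Map.merge call. A minimal sketch of the same counter (the class name T8 is a placeholder; the file path assumes the same ip.txt as above):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

public class T8 {
    public static void main(String[] args) throws IOException {
        Map<String, Integer> counts = new HashMap<>();
        // merge() stores 1 for a new key, otherwise adds 1 to the existing count
        for (String line : Files.readAllLines(Paths.get("F:/ip.txt"))) {
            counts.merge(line.split(" ")[0], 1, Integer::sum);
        }
        counts.forEach((ip, n) -> System.out.println(ip + " " + n));
    }
}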


Hadoop MapReduce
 

package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

  public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

    private static final IntWritable one = new IntWritable(1);
    private Text word = new Text();

    // Emit (token, 1) for every whitespace-separated token in the line
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    // Sum all the counts emitted for the same key
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: wordcount <in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
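As written this is the classic word count: TokenizerMapper emits (token, 1) for every whitespace-separated token, and IntSumReducer, which doubles as the combiner, sums the counts per key. For the duplicate-IP case, the mapper only needs to emit the first field of each line; a minimal sketch of that variation (the class name IpMapper is hypothetical):

  public static class IpMapper extends Mapper<Object, Text, Text, IntWritable> {

    private static final IntWritable one = new IntWritable(1);
    private Text ip = new Text();

    // Emit (ip, 1) using only the first space-separated field of the line
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      ip.set(value.toString().split(" ")[0]);
      context.write(ip, one);
    }
  }

It would be registered with job.setMapperClass(IpMapper.class) in place of TokenizerMapper.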


Spark

scala> val file = sc.textFile("hdfs://bigdata1:9000/wordcount/wc_in/test1.txt")
scala> val count = file.flatMap(line => line.split("\t")).map(word => (word, 1)).reduceByKey(_+_)
scala> count.collect()
scala> count.saveAsTextFile("hdfs://bigdata1:9000/wordcount/wc_out6")
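Run from the spark-shell REPL: textFile loads the input from HDFS, flatMap splits every line on tabs, map pairs each token with a count of 1, and reduceByKey sums the counts per token. collect() materializes the result on the driver for inspection, and saveAsTextFile writes it back to HDFS. For a space-separated file like ip.txt, you would split on spaces and keep only the first field before pairing it with 1.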


Reference

http://blog.csdn.net/jiedushi/article/details/7403365