Hadoop: Using HDFS Files in MapReduce
This code does the following: it copies a local file into HDFS as hdfs:///copyOftest.c, retrieves the DataNode hostnames and writes them to the HDFS file hdfs:///output/listOfDatanode, and then runs a word count over hdfs:///copyOftest.c. Unlike the word-count example shipped with Hadoop, which reads a file from the local file system, this job reads its input directly from HDFS.

package com.fora;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FileOperate {

    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        init(); /* stage the input file in HDFS and record the DataNode list */

        Configuration conf = new Configuration();
        Job job = new Job(conf, "word count");
        job.setJarByClass(FileOperate.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        /* both the input and output paths are in HDFS */
        FileInputFormat.addInputPath(job, new Path("hdfs:///copyOftest.c"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs:///wordcount"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            /* emit (token, 1) for every whitespace-separated token in the line */
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            /* sum the counts for each word */
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void init() throws IOException {
        Configuration config = new Configuration();
        FileSystem hdfs = FileSystem.get(config);

        /* copy a local file to HDFS */
        Path srcPath = new Path("/test.c");
        Path dstPath = new Path("hdfs:///copyOftest.c");
        hdfs.copyFromLocalFile(srcPath, dstPath);
        System.out.print("copy success!\n");

        /* print the block size of the copied file (67108864 = the 64 MB default) */
        FileStatus fileStatus = hdfs.getFileStatus(dstPath);
        System.out.println(fileStatus.getBlockSize());

        /* get the list of DataNodes; getDataNodeStats() is specific to DistributedFileSystem */
        DistributedFileSystem dfs = (DistributedFileSystem) hdfs;
        DatanodeInfo[] dataNodeStats = dfs.getDataNodeStats();

        /* create a file on HDFS and write the DataNode hostnames to it */
        Path outputPath = new Path("hdfs:///output/listOfDatanode");
        FSDataOutputStream outputStream = hdfs.create(outputPath);
        for (DatanodeInfo node : dataNodeStats) {
            String name = node.getHostName();
            System.out.println(name);
            byte[] bytes = name.getBytes();
            outputStream.write(bytes, 0, bytes.length);
        }
        outputStream.close();
    }
}
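One practical caveat before the run log below: FileOutputFormat refuses to start a job whose output directory already exists, so rerunning this program fails once hdfs:///wordcount has been created. A minimal guard, assuming the same Configuration and output path as above, is to delete the directory before submitting:

/* sketch: make the job rerunnable by removing stale output first */
FileSystem fs = FileSystem.get(conf);
Path out = new Path("hdfs:///wordcount");
if (fs.exists(out)) {
    fs.delete(out, true); /* true = delete recursively */
}

Dropped into main() just before FileOutputFormat.setOutputPath(), this avoids a manual hadoop dfs -rmr between runs.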
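init() exercises only the write side of the FileSystem API (create() and write()). The read side is symmetric: FileSystem.open() returns an FSDataInputStream. Here is a minimal sketch that streams an HDFS file to stdout; the class name CatHdfsFile is made up for this example:

import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class CatHdfsFile {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        InputStream in = null;
        try {
            /* open() returns an FSDataInputStream */
            in = fs.open(new Path("hdfs:///copyOftest.c"));
            /* copy with a 4 KB buffer; false = leave the streams open */
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
        }
    }
}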
Run results:

[root@master bin]# hadoop jar HDFS.jar com.fora.FileOperate
copy success!
67108864
master
slave1
11/07/21 15:45:23 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
11/07/21 15:45:23 INFO input.FileInputFormat: Total input paths to process : 1
11/07/21 15:45:23 INFO mapred.JobClient: Running job: job_201107210917_0003
11/07/21 15:45:24 INFO mapred.JobClient:  map 0% reduce 0%
11/07/21 15:45:31 INFO mapred.JobClient:  map 100% reduce 0%
11/07/21 15:45:43 INFO mapred.JobClient:  map 100% reduce 100%
11/07/21 15:45:45 INFO mapred.JobClient: Job complete: job_201107210917_0003
11/07/21 15:45:45 INFO mapred.JobClient: Counters: 17
11/07/21 15:45:45 INFO mapred.JobClient:   Job Counters
11/07/21 15:45:45 INFO mapred.JobClient:     Launched reduce tasks=1
11/07/21 15:45:45 INFO mapred.JobClient:     Rack-local map tasks=1
11/07/21 15:45:45 INFO mapred.JobClient:     Launched map tasks=1
11/07/21 15:45:45 INFO mapred.JobClient:   FileSystemCounters
11/07/21 15:45:45 INFO mapred.JobClient:     FILE_BYTES_READ=228
11/07/21 15:45:45 INFO mapred.JobClient:     HDFS_BYTES_READ=126
11/07/21 15:45:45 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=488
11/07/21 15:45:45 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=146
11/07/21 15:45:45 INFO mapred.JobClient:   Map-Reduce Framework
11/07/21 15:45:45 INFO mapred.JobClient:     Reduce input groups=19
11/07/21 15:45:45 INFO mapred.JobClient:     Combine output records=19
11/07/21 15:45:45 INFO mapred.JobClient:     Map input records=8
11/07/21 15:45:45 INFO mapred.JobClient:     Reduce shuffle bytes=228
11/07/21 15:45:45 INFO mapred.JobClient:     Reduce output records=19
11/07/21 15:45:45 INFO mapred.JobClient:     Spilled Records=38
11/07/21 15:45:45 INFO mapred.JobClient:     Map output bytes=211
11/07/21 15:45:45 INFO mapred.JobClient:     Combine input records=22
11/07/21 15:45:45 INFO mapred.JobClient:     Map output records=22
11/07/21 15:45:45 INFO mapred.JobClient:     Reduce input records=19
[root@master bin]# hadoop dfs -ls /
Found 6 items
-rw-r--r--   1 root supergroup        126 2011-07-21 15:45 /copyOftest.c
-rw-r--r--   1 root supergroup         26 2011-07-21 15:16 /listOfDatanode
drwxr-xr-x   - root supergroup          0 2011-07-21 15:45 /output
-rw-r--r--   1 root supergroup      10400 2011-07-20 16:51 /test.txt
drwxr-xr-x   - root supergroup          0 2011-07-20 16:09 /tmp
drwxr-xr-x   - root supergroup          0 2011-07-21 15:45 /wordcount
[root@master bin]# hadoop dfs -ls /wordcount
Found 2 items
drwxr-xr-x   - root supergroup          0 2011-07-21 15:45 /wordcount/_logs
-rw-r--r--   1 root supergroup        146 2011-07-21 15:45 /wordcount/part-r-00000
[root@master bin]# hadoop dfs -cat /wordcount/part-r-00000
2011-07-21	1
File	1
Hadoop	1
System!	1
a	1
aimed	1
at	1
coping	1
file	3
from	1
from:fora	1
is	1
local	1
system	1
thank	1
the	1
this	2
to	1
you!	1
[root@master bin]#
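Note that getDataNodeStats() lists every DataNode in the cluster (here master and slave1), not the nodes that hold a particular file. To see where the blocks of copyOftest.c actually live, FileSystem.getFileBlockLocations() can be queried; the sketch below (the class name ShowBlockLocations is invented for illustration) prints the hosts per block:

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ShowBlockLocations {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FileStatus status = fs.getFileStatus(new Path("hdfs:///copyOftest.c"));
        /* one BlockLocation per block; copyOftest.c is 126 bytes, so a single block */
        BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, status.getLen());
        for (BlockLocation block : blocks) {
            /* getHosts() names the DataNodes holding replicas of this block */
            System.out.println(Arrays.toString(block.getHosts()));
        }
    }
}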