海量日志数据,找出出现次数最多的IP地址。
来源:互联网 发布:网游在线人数数据 编辑:程序博客网 时间:2024/05/22 10:27
问题描述
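有一个12G的文本文件,每行记录的是一个IP地址,现要找出这个文件中出现次数最多的那个IP。文件太大,不便一次性装入内存统计,因此思路是:先按IP的哈希值对1024取模,把大文件拆分成若干个小文件(相同的IP必然落入同一个小文件),再分别统计每个小文件中出现次数最多的IP,最后在这些候选IP中取出现次数最多的一个。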
代码实现
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Mutable value holder pairing an IP address string with the number of times
 * that address was seen.
 */
class IP implements Serializable {

    private static final long serialVersionUID = -8903000680469719698L;

    /** Textual IP address; the empty string until a value is assigned. */
    private String ip = "";

    /** Number of occurrences recorded for the address. */
    private int count;

    /**
     * Creates a record for one IP address.
     *
     * @param ip2 the IP address text
     * @param integer the occurrence count; must not be {@code null}
     * @throws NullPointerException if {@code integer} is {@code null}
     */
    public IP(String ip2, Integer integer) {
        // Fail with a descriptive message instead of a bare NPE from auto-unboxing.
        if (integer == null) {
            throw new NullPointerException("count must not be null (ip=" + ip2 + ")");
        }
        this.ip = ip2;
        this.count = integer.intValue();
    }

    /**
     * @return the occurrence count
     */
    public int getCount() {
        return count;
    }

    /**
     * @return the IP address text
     */
    public String getIp() {
        return ip;
    }

    /**
     * @param count the new occurrence count
     */
    public void setCount(int count) {
        this.count = count;
    }

    /**
     * @param ip the new IP address text
     */
    public void setIp(String ip) {
        this.ip = ip;
    }

    /**
     * @return the address and its count in the form {@code ip:count}
     */
    @Override
    public String toString() {
        return ip + ":" + count;
    }

}
- public class No2 {
- static String fileLoc = "D:\\bigdata_ip.txt";
-
- public static void findIp() throws IOException, ClassNotFoundException {
- long start = System.currentTimeMillis();
- hashToSmallFiles();
- long end1 = System.currentTimeMillis();
- System.out.println("将大文件映射成小文件,用时:" + (end1 - start) + "毫秒");
-
- System.out.println("映射到小文件完成,开始统计每个小文件中出现频率最高的ip");
- long start1 = System.currentTimeMillis();
- List<IP> list = countEverySmallFile();
- long end2 = System.currentTimeMillis();
- System.out.println("统计所有文件共用时:" + (end2 - start1) + " 毫秒");
-
- System.out.println("统计完成,开始计算所有ip中出现频率最高的ip");
- IP ip = calculateResult(list);
- System.out.println("访问次数最多的ip是:" + ip.getIp() + ":" + ip.getCount());
- long end = System.currentTimeMillis();
- System.out.println("公用时:" + (end - start) + "毫秒");
- }
-
-
-
-
-
-
- private static IP calculateResult(List<IP> list) {
- IP[] ips = new IP[list.size()];
- ips = list.toArray(ips);
- int max = 0;
- for (int j = 1; j < ips.length; j++) {
- if (ips[j].getCount() > ips[max].getCount()) {
- max = j;
- }
- }
- return ips[max];
- }
-
-
-
-
-
-
-
-
- private static List<IP> countEverySmallFile() throws FileNotFoundException, IOException {
- List<IP> list = new ArrayList<IP>();
- for (int i = 0; i < 1024; i++) {
- File file = new File(fileLoc + i + ".txt");
- if (file.exists()) {
- long startTime = System.currentTimeMillis();
- BufferedReader br1 = new BufferedReader(new FileReader(file));
- String ip1 = "";
- HashMap<String, Integer> hm = new HashMap<String, Integer>();
- while ((ip1 = br1.readLine()) != null) {
- if (!hm.containsKey(ip1)) {
- hm.put(ip1, 1);
- } else {
- hm.put(ip1, hm.get(ip1) + 1);
- }
- }
-
- IP[] ips = new IP[hm.size()];
- int index = 0;
- for (String temp : hm.keySet()) {
- ips[index] = new IP(temp, hm.get(temp));
- index++;
- }
- int max = 0;
- for (int j = 1; j < ips.length; j++) {
- if (ips[j].getCount() > ips[max].getCount()) {
- max = j;
- }
- }
- list.add(ips[max]);
- long endTime = System.currentTimeMillis();
- System.out.println("已经统计文件:" + fileLoc + i + ".txt,用时:" + (endTime - startTime) + " 毫秒");
- }
- }
- return list;
- }
-
-
-
-
-
-
-
- private static void hashToSmallFiles() throws FileNotFoundException, IOException {
- BufferedReader br = new BufferedReader(new FileReader(fileLoc));
- String ip = "";
- HashMap<String, FileWriter> fileWriters = new HashMap<String, FileWriter>();
- while ((ip = br.readLine()) != null) {
- int tmp = Math.abs(ip.hashCode() % 1024);
- String fileName = fileLoc + tmp + ".txt";
- FileWriter fw = null;
- if (fileWriters.containsKey(fileName)) {
- fw = fileWriters.get(fileName);
- } else {
- fw = new FileWriter(fileName, true);
- fileWriters.put(fileName, fw);
- }
- fw.write(ip + "\n");
- }
- br.close();
- for (FileWriter ff : fileWriters.values()) {
- ff.close();
- }
- }
-
-
-
-
-
-
- private static void generateFile() throws IOException {
- FileWriter fw = new FileWriter(fileLoc, true);
- for (int i = 0; i < 100000000; i++) {
- for (int j = 0; j < 100000000; j++) {
- fw.write(generateIp() + "\n");
- }
- }
- fw.close();
- System.out.println("done");
- }
-
-
-
-
-
-
- private static String generateIp() {
- String ip = "";
- for (int i = 0; i < 4; i++) {
- int temp = (int) (Math.random() * 255);
- ip += temp + ".";
- }
- return ip.substring(0, ip.length() - 1);
- }
-
- public static void main(String[] args) {
- try {
- findIp();
- } catch (Exception e) {
-
- e.printStackTrace();
- }
- }
-
- }