2013年阿里巴巴一道笔试题(大文件处理)

来源:互联网 发布:虚拟商城软件下载 编辑:程序博客网 时间:2024/04/30 04:58

现有一个亿级别数据量的文件,其中有按key升序的记录,现要求通过输入key查找对应的记录。

对于这种大文件,读取时一般采用内存映射文件(memory-mapped file);另外,常见的处理方式是对文件进行分割。把文件分割为若干小文件后,记录下每个小文件中最小的key值;查找时,找到最小key值不大于输入key的最后一个小文件,它就是该key可能所在的小文件,然后把这个小文件读入内存,进行二分查找。

下面是完整的程序代码。为方便起见,文件中仅记录了key(每个key为一个4字节整数)。

/** 分别采用分割文件,在子文件中进行二分查找的方法 和 直接进行键值查找的方法进行时间对比*/package com.alibaba;import java.io.BufferedInputStream;import java.io.BufferedOutputStream;import java.io.DataInputStream;import java.io.DataOutputStream;import java.io.EOFException;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.RandomAccessFile;import java.nio.IntBuffer;import java.nio.MappedByteBuffer;import java.nio.channels.FileChannel;import java.util.ArrayList;import java.util.TreeMap;import java.util.Map.Entry;public class SortBigFile {//单个文件大小private static final int FILE_SIZE = 1024*1024*10;//整数个数private static final int numOfInt = 100000000;//分隔文件存储目录private static final String divPath = "E:\\divide";//记录分隔文件后每个文件的最小值,及对应的文件路径private TreeMap<Integer,String> fileMap = new TreeMap<Integer,String>();//待查找的大文件路径private String filePath;public SortBigFile(String path){this.filePath = path;}//写入大文件,做测试public void writeInt(){FileChannel fc = null;IntBuffer ib = null;try {fc = new RandomAccessFile(filePath,"rw").getChannel();ib = fc.map(FileChannel.MapMode.READ_WRITE, 0, numOfInt*4).asIntBuffer();} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}for(int i=0;i<numOfInt;++i)ib.put(i);try {fc.close();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}}//对大文件进行分隔public void splitFile(){FileChannel fc = null;IntBuffer out = null;try {fc = new RandomAccessFile(filePath,"r").getChannel();out = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size()).asIntBuffer();} catch (FileNotFoundException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}//对文件进行分隔try {for(int i=0;i<fc.size()/FILE_SIZE+1;++i){String path = divPath+"\\"+System.currentTimeMillis()+".tmp";DataOutputStream dos = null;try {dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(new 
File(path))));} catch (FileNotFoundException e) {// TODO Auto-generated catch blocke.printStackTrace();}for(int j=0;j<FILE_SIZE/4;++j){if(out.hasRemaining()){int num = out.get();if(j==0){fileMap.put(num, path);}dos.writeInt(num);}}dos.close();}} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}try {fc.close();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}}//对大文件进行键值查找public String findKey(int key){int fnum;String fpath="";boolean flag = false;for(Entry<Integer,String> entry:fileMap.entrySet()){int num = entry.getKey();String path = entry.getValue();if(num==key)return path+":first num";else if(num<key){fnum = num; fpath = path;}else{flag = true;break;}}if(fpath.isEmpty()||!flag){return "find nothing";}int index = binarySearch(fpath, key);if(index==-1)return "find nothing";elsereturn fpath+":"+index+" num";}//对子文件进行二分查找private int binarySearch(String path, int key){int index = 0;ArrayList<Integer> nums = new ArrayList<Integer>();DataInputStream dos = null;try {dos = new DataInputStream(new BufferedInputStream(new FileInputStream(new File(path))));} catch (FileNotFoundException e) {// TODO Auto-generated catch blocke.printStackTrace();}while(true){int num;try {num = dos.readInt();nums.add(num);} catch (EOFException e) {// TODO Auto-generated catch block//文件结尾break;}catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}}int start =0, end = nums.size()-1;while(start<end){int mid = start + (end-start)/2;if(nums.get(mid)==key)return mid;else if(nums.get(mid)>key){end = mid;}else{start = mid;}}return -1;}//采用直接内存映射读入文件,然后进行比较得到键值的方法public int dirFind(int key){FileChannel fc = null;IntBuffer out = null;try {fc = new RandomAccessFile(filePath,"r").getChannel();out = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size()).asIntBuffer();} catch (FileNotFoundException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (IOException e) {// TODO Auto-generated catch 
blocke.printStackTrace();}int index = 0;while(out.hasRemaining()){int num = out.get();if(num==key)return index;else if(num<key){++index;}else{return -1;}}return -1;}/** * @param args */public static void main(String[] args) {// TODO Auto-generated method stubSortBigFile bigfile = new SortBigFile("E:\\test.tmp");long start = 0;long end = 0;long withoutDiv = 0;bigfile.writeInt();//计算文件分割,然后二分查找键值的时间start = System.currentTimeMillis();bigfile.splitFile();withoutDiv = System.currentTimeMillis();System.out.println(bigfile.findKey(88732723));end = System.currentTimeMillis();System.out.println("with divide file time:"+(end-start)/1000);System.out.println("without divide file time:"+(end-withoutDiv)/1000);System.out.println("---------------");//不进行分割,直接查找的时间start = System.currentTimeMillis();System.out.println("Index:"+ bigfile.dirFind(88732723));end = System.currentTimeMillis();System.out.println((end-start)/1000);}}


原创粉丝点击