Java高效排序大文件

来源:互联网 发布:铝合金型材批发 知乎 编辑:程序博客网 时间:2024/05/22 17:25
package com.felix;


import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
import java.util.concurrent.atomic.AtomicInteger;


/**
 * This a main class is for the final competition.</br>
 * 
 * 256M * 1024 = 262144 BYTE
 * 
 * 900 * 4 = 3600 Byte
 * 
 * 262144 - 3600 = 258544 byte
 * 
 * 
 * 
 * 
 * @author Felix
 * 
 */


public class SorterFinal {


/**
* this is not a realize size, just approximately.
*/
private static final int MAX_BYTE_COUNT_OF_LINE = 128;

/**
* the size of buffer,
* which is used to storage the original data from file.
*/
private static final int MAX_BYTE_COUNT_OF_BUFFER = 200000;

/**
* the block count of file
*/
private static final byte BLOCK_COUNT = 3;


/**
* the thread count of process block data.
*/

private static final byte THREAD_COUNT = 8;

private static final byte LEN_OF_ITEM = 8;

/**
* it is a data buffer,
* which is used to storage the data which has been decoded.
*/
private static int[][] data = new int[9000][];


/**
* initial a empty line for first line.
*/
static{
data[0] = new int[LEN_OF_ITEM];
}

private static AtomicInteger currentRowIndex = new AtomicInteger(0); 

private static final byte[] COL_INDEX_MAP = {
0, 1, 4, 5, 6, 7, 8
};





public void call(File inputFile, File outputFile, File tempDir) 
throws Exception {
long fileLen = inputFile.length();
long blockSize = fileLen / BLOCK_COUNT;
int blockBufferSize = MAX_BYTE_COUNT_OF_BUFFER / BLOCK_COUNT;
FileInputStream fi = new FileInputStream(inputFile);
FileChannel fc = fi.getChannel();




}


/**
* Get the end offset which is a end of line of a block with a specified end
* offset,

* @param fc
* @param end
* @return 
* the next index of block end offset index.
* @throws IOException
*/
private int getBlockEndOffset(FileChannel fc, int end) 
throws IOException {
ByteBuffer buffer = fc.map(MapMode.READ_ONLY, end,
MAX_BYTE_COUNT_OF_LINE);
for ( ; '\n' != buffer.get(); end++);
return end;
}


/**
 * Get the end offset which is a end of line of block with a specified end offset.
 * @param biData
 * block binary data.
 * @param end
 * the offset of end.
 * @return
 * @throws IOException
 */
private int getBlockEndOffset(byte[] biData, int end) 
throws IOException {
for ( ; '\n' != biData[end]; end++);
return end;
}


/**
* Process the binary data of a block by a multi-thread method.

* @param fi
* the file input stream of input file.
* @param bufferSize
* the size of buffer which is used to storage the original data.
* @param start
* the start offset in the file of the block.
* @param end
* the end offset in the file of the block.
* @throws IOException
*/
private void processBlockData(FileInputStream fi, int bufferSize,
int start, int end) throws IOException{
ByteBuffer buffer = ByteBuffer.allocateDirect(
bufferSize);
fi.skip(start);
FileChannel fc = fi.getChannel();

int _len = fc.read(buffer);
int _blockSize = _len / THREAD_COUNT;

byte[] biData = buffer.array();
int _end = getBlockEndOffset(biData, _blockSize);

}


public static void main(String args[]) {


}



class ParseTask implements Runnable{

private byte[] biData;
private int start;
private int end;
@Override
public void run() {

int colStart = 0;
int col = 0;
int colLen = 0;


for (int index = start; index < end ; index++){
byte c = biData[index];

if('\n' == c){
createNewLine();
}

if( ',' == c || '\r' == c ) {
if( col == 0 || col == 2){
processInt(col, colStart, colLen);
}

if( col == 1){
processString(col, colStart, colLen);
}

if(col == 3 || col == 4){
processDate(col, colStart, colLen);
}

if(col == 5){
processInt(col, colStart, colLen);
}

}
}

}


private void processDate(int col, int colStart, int len) {
int currentRow = currentRowIndex.get();
data[currentRow] = new int[LEN_OF_ITEM];
data[currentRow][COL_INDEX_MAP[col]] = Utils.date2Int(biData, colStart);

}


private void processString(int col, int colStart, int len) {

int currentRow = currentRowIndex.get();
data[currentRow] = new int[LEN_OF_ITEM];
int _len = len;
for(int i = 0; i < 3; i++){

data[currentRow][COL_INDEX_MAP[col] + i] = Utils.byte2Int(
biData, colStart + i * 8, _len >= 4 ? 4 : _len);
_len -= 4;
}

}

private void createNewLine(){
int currentRow = currentRowIndex.getAndIncrement();
data[currentRow] = new int[LEN_OF_ITEM];
}


private void processInt(int col, int colStart, int len) {

int currentRow = currentRowIndex.get();
data[currentRow][COL_INDEX_MAP[col]] = Utils.byte2Int(
biData, colStart, len);

}


}

}








/*=====================*/




package com.felix;


/**
 * Small helpers for decoding ASCII text held in byte arrays.
 *
 * @author Felix
 */
public class Utils {

    /**
     * Converts a run of ASCII decimal digits, optionally preceded by
     * {@code '-'}, to an int.
     *
     * @param array  the bytes to read from
     * @param offset index of the first byte of the number
     * @param len    number of bytes that make up the number, sign included
     * @return the parsed value; 0 if {@code len} is not positive
     */
    static int byte2Int(byte[] array, int offset, int len) {
        int end = offset + len;
        int pos = offset;
        int sign = 1;

        if (pos < end && '-' == array[pos]) {
            sign = -1;
            pos++;
        }
        return sign * parseDigits(array, pos, end);
    }

    /**
     * Converts the first {@code len} bytes of an array to an int.
     *
     * @param array the bytes to read from, starting at index 0
     * @param len   number of bytes that make up the number, sign included
     * @return the parsed value; 0 if {@code len} is not positive
     */
    static int byte2Int(byte[] array, int len) {
        return byte2Int(array, 0, len);
    }

    /**
     * Converts a NUL-terminated byte array to an int. If there is no
     * {@code '\0'} the whole array is used.
     *
     * @param array the bytes to read from, starting at index 0
     * @return the parsed value
     */
    static int byte2Int(byte[] array) {
        int end = 0;
        while (end < array.length && array[end] != '\0') {
            end++;
        }
        return byte2Int(array, 0, end);
    }

    /**
     * Converts a date written as {@code MM/DD/YYYY} to an int of the form
     * {@code YYYYMMDD}, e.g. {@code 10/21/2014 --> 20141021}.
     * The two separator bytes are skipped without being checked.
     *
     * @param array  the bytes to read from
     * @param offset index of the first byte of the date
     * @return the date as {@code YYYYMMDD}
     */
    static int date2Int(byte[] array, int offset) {
        int year = parseDigits(array, offset + 6, offset + 10);
        int month = parseDigits(array, offset, offset + 2);
        int day = parseDigits(array, offset + 3, offset + 5);
        return (year * 100 + month) * 100 + day;
    }

    /**
     * Accumulates the ASCII digits in {@code array[from, to)} as a decimal
     * number. Only the low four bits of each byte are used, so the bytes are
     * not validated.
     */
    private static int parseDigits(byte[] array, int from, int to) {
        int value = 0;
        for (int i = from; i < to; i++) {
            value = value * 10 + (array[i] & 0x0f);
        }
        return value;
    }
}

0 0