给40亿个不重复的unsigned int的整数,没排过序的,然后再给一个数,如何快速判断这个数是否在那40亿个数当中?

来源:互联网 发布:云墙vpn端口不可用 编辑:程序博客网 时间:2024/06/03 14:41
总共有4*10^9个数字,如果直接放在内存中,需要的内存量是
4*10^9*4bytes(一个unsigned int为4bytes) ~= 16GBytes内存
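用bitmap表示时,每个可能的取值只占1 bit。32位unsigned int的取值范围共有2^32 = 4 294 967 296个值,因此位图需要2^32 bits = 512 MBytes内存(位图的大小取决于取值范围,而不是数据的个数;若只按4*10^9 bits计算约为477 MBytes,无法覆盖全部取值)。所以索性就分配512M内存,先将所有位清0,然后依次将这40亿个不重复的数字添加到bitmap中,即把对应的位置为1。最后直接判断要查找的数字在bitmap中对应的位:如果为0,则表示没有;如果为1,则表示有。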

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Bitmap membership demo.
 *
 * Reads whitespace-separated unsigned integers from DATA_FILE_NAME, sets one
 * bit per value in a heap-allocated bitmap, then reports whether a few probe
 * values are present.
 */

/*
 * Number of distinct values the bitmap can represent; valid values are
 * 0 .. TOTAL_DATA_SIZE-1.  The demo uses 300.  Note that a bitmap is sized
 * by the VALUE RANGE, not by how many numbers are stored: covering every
 * 32-bit unsigned int needs 4294967296 (2^32) bits = 512 MB, whereas
 * 4000000000 bits would leave the largest values out of range.
 */
#define TOTAL_DATA_SIZE 300

/*
 * Storage unit of the bitmap.  Unsigned so that setting the top bit of a
 * unit never goes through a signed conversion.
 */
typedef unsigned char bitmap_type;

#define BITMAP_BITS (sizeof(bitmap_type)*CHAR_BIT)
/* Number of bitmap_type units needed to hold TOTAL_DATA_SIZE bits (rounded up). */
#define MAX_BUFFER_SIZE ((TOTAL_DATA_SIZE+BITMAP_BITS-1)/BITMAP_BITS)

/*
 * Raw bit operations.  They perform NO range check and evaluate `v` twice:
 * callers must pass a side-effect-free value below TOTAL_DATA_SIZE.
 */
#define bitmap_set(b,v) ((b)[(v)/BITMAP_BITS] |= (bitmap_type)(1u << ((v)%BITMAP_BITS)))
#define bitmap_clear(b,v) ((b)[(v)/BITMAP_BITS] &= (bitmap_type)~(1u << ((v)%BITMAP_BITS)))
#define bitmap_isset(b,v) ((b)[(v)/BITMAP_BITS] & (1u << ((v)%BITMAP_BITS)))
#define bitmap_zero(b) memset((b),0,sizeof(bitmap_type)*MAX_BUFFER_SIZE)

#define DATA_FILE_NAME "data.txt"

/* Returns non-zero when v has a bit in the bitmap (v < TOTAL_DATA_SIZE). */
static int bitmap_in_range(unsigned int v)
{
    return (unsigned long long)v < (unsigned long long)TOTAL_DATA_SIZE;
}

/*
 * Range-checked lookup: returns non-zero when v is recorded in the bitmap.
 * Values outside the bitmap can never have been stored, so they report 0
 * instead of reading past the end of the buffer.
 */
static int bitmap_contains(const bitmap_type *bitmap, unsigned int v)
{
    return bitmap_in_range(v) && bitmap_isset(bitmap, v);
}

/*
 * Parses a decimal token into *out.
 * Returns 0 on success, -1 when the token is not a plain non-negative
 * decimal number or does not fit in an unsigned int.
 */
static int parse_uint(const char *tok, unsigned int *out)
{
    char *end;
    unsigned long v;

    /* strtoul accepts a sign and silently negates; only digits are valid here. */
    if (tok[0] < '0' || tok[0] > '9')
        return -1;

    errno = 0;
    v = strtoul(tok, &end, 10);
    if (*end != '\0' || errno == ERANGE || v > UINT_MAX)
        return -1;

    *out = (unsigned int)v;
    return 0;
}

/*
 * Reads every whitespace-separated number from fp and sets its bit.
 * Values outside the bitmap range are skipped with a warning on stderr.
 * Returns 0 on success, -1 on a malformed token or a read error.
 */
static int load_numbers(FILE *fp, bitmap_type *bitmap)
{
    /* 31 characters is more than any valid unsigned int needs; an overlong
     * token is split by fscanf and its first piece fails the range check. */
    char tok[32];
    unsigned int num;

    while (fscanf(fp, "%31s", tok) == 1) {
        if (parse_uint(tok, &num) != 0) {
            fprintf(stderr, "invalid number \"%s\" in %s\n", tok, DATA_FILE_NAME);
            return -1;
        }
        if (!bitmap_in_range(num)) {
            fprintf(stderr, "skipping %u: outside bitmap range [0, %llu)\n",
                    num, (unsigned long long)TOTAL_DATA_SIZE);
            continue;
        }
        bitmap_set(bitmap, num);
    }

    return ferror(fp) ? -1 : 0;
}

/*
 * Builds the bitmap from DATA_FILE_NAME and probes the values 2, 222 and 333.
 * Returns 0 on success, -1 on allocation, open, or input failure.
 */
int main(int argc, char *argv[])
{
    bitmap_type *bitmap;
    FILE *fp;
    int rc = -1;

    (void)argc;
    (void)argv;

    /* calloc hands back zeroed memory, so every value starts as "absent". */
    bitmap = calloc(MAX_BUFFER_SIZE, sizeof *bitmap);
    if (bitmap == NULL) {
        printf("bitmap malloc error\n");
        return -1;
    }

    /* The data file is text, so it is opened in text mode. */
    fp = fopen(DATA_FILE_NAME, "r");
    if (fp == NULL) {
        printf("fopen file %s error\n", DATA_FILE_NAME);
        goto out_free;
    }

    if (load_numbers(fp, bitmap) != 0)
        goto out_close;

    if (bitmap_contains(bitmap, 2)) {
        printf("2 is set in the bitmap\n");
    }

    if (bitmap_contains(bitmap, 222)) {
        printf("222 is set in the bitmap\n");
    } else {
        printf("222 is not in the bitmap\n");
    }

    /* 333 lies beyond TOTAL_DATA_SIZE; the checked lookup reports "absent". */
    if (bitmap_contains(bitmap, 333)) {
        printf("333 is set in the bitmap\n");
    } else {
        printf("333 is not in the bitmap\n");
    }

    rc = 0;

out_close:
    fclose(fp);
out_free:
    free(bitmap);
    return rc;
}





[root@localhost c_language]# more data.txt
1
2
3
222
5
6
7
8

0 0