大数据处理---C++

来源:互联网 发布:格洛纳斯 北斗知乎 编辑:程序博客网 时间:2024/05/16 09:02

一个很大的文件(比如 1TB)肯定无法一次性全部读入内存,里边有很多字段,假设是这样的:

1

1

1

这样一行一个数字,简单点,不统计每个数字的频率了,仅仅统计有多少个1,如何做呢?

FILE * p = fopen("test.dat","r");__int64 len=0;char a[2]={0};for(int i = 0 ;; i ++){fread(a,2,1,p);if(a[0]!='1')break;len =i*1024;}printf("%s  %d\n",a,len);fclose(p);p = NULL;
之前以为 fopen 是将文件全部加载进内存后再操作,实践之后发现不是这样,它可以一点点向后读取。但是文件要是太大,直接 seek 到文件结尾还是会出现异常、崩溃,这时候可以使用内存映射文件(memory-mapped file)来处理:

#include <stdio.h>#include <Windows.h>void createBigFile(int NumberGB=0){FILE * p = fopen("d:\\test.dat","w+");for(int i = 0 ; i < NumberGB ; i ++){fwrite("1\n",2,1,p);}fclose(p);p = NULL;}void CountBigFile0(){FILE * p = fopen("test.dat","r");__int64 len=0;char a[2]={0};for(int i = 0 ; i < 2992742400 ; i ++){fseek(p,i*1024,SEEK_SET);fread(a,2,1,p);if(a[0]!='1')break;len =i*1024;}printf("%s  %d\n",a,len);fclose(p);p = NULL;}__int64 CountBigFile1(void){__int64 count = 0;SYSTEM_INFO sys;GetSystemInfo(&sys);HANDLE hFile = CreateFile(TEXT("d:\\test.dat"),FILE_READ_DATA|FILE_WRITE_DATA,FILE_SHARE_READ|FILE_SHARE_WRITE,NULL,OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);if(hFile == INVALID_HANDLE_VALUE) return 0;HANDLE hFileMapping = CreateFileMapping(hFile,NULL,PAGE_READWRITE | SEC_COMMIT,0,0,NULL);DWORD dwFileHigh;__int64 qwFileSize = GetFileSize(hFile,&dwFileHigh);qwFileSize += ( ((__int64)dwFileHigh)<<32);CloseHandle(hFile);__int64 qwFileOffset = 0;while (qwFileSize > 0){DWORD dwBytesInblock = sys.dwAllocationGranularity;if(qwFileSize < dwBytesInblock)dwBytesInblock = qwFileSize;PBYTE pbFile = (PBYTE) MapViewOfFile(hFileMapping,FILE_MAP_WRITE|FILE_MAP_READ,(DWORD)qwFileOffset>>32,(DWORD)(qwFileOffset&0xffffffff),dwBytesInblock);memcpy(pbFile,"12345\n",6);//for(int i = 0 ; i < dwBytesInblock ; i ++)//{////doing//count ++;//记录行数//}//BOOL bRet = ::FlushViewOfFile(pbFile, dwBytesInblock);//if ( bRet == FALSE )//{//DWORD dwError = GetLastError();//return FALSE;//}//UnmapViewOfFile(pbFile);qwFileOffset+=dwBytesInblock;qwFileSize-=dwBytesInblock;}printf("count = %I64d\n",qwFileSize);CloseHandle(hFileMapping);return count;}int main(){createBigFile(20);CountBigFile1();return 0;}


结果是一样的,很棒的大数据方法。

摘自图书:《Windows via C/C++》 from vm 

0 0
原创粉丝点击