OpenRTMFP/Cumulus开发笔记(7) Cumulus大数据处理实例(续)

来源：互联网 | 发布：影子写手知乎 | 编辑：程序博客网 | 时间：2024/05/16 14:51

前一篇啰啰嗦嗦，已经把相关原理叙述了，这里把源码附上，如下：

一.HashMpq.h头文件如下：

#ifndef HASHMPQ_H_
#define HASHMPQ_H_
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <string>
// Size in bytes of the key-text buffer kept in every table slot
// (the terminating '\0' has to fit in this size as well).
#define MAXFILENAME 25
// Slot count HashMpq falls back to when its constructor gets no length.
#define MAXTABLELEN 1024

// One slot of the open-addressing table managed by HashMpq.
struct MPQHASHTABLE
{
    long nHashA;                      // HashString(key, 1) of the stored key; reset() writes -1
    long nHashB;                      // HashString(key, 2) of the stored key; reset() writes -1
    bool bExists;                     // true once the slot has been claimed by a key
    char test_filename[MAXFILENAME];  // text of the stored key, NUL-terminated
    int count;                        // set to 1 on insertion and to 0 by reset()
};
// String-keyed open-addressing hash table.
//
// Each key is hashed three ways with HashString(): hash type 0 picks the
// starting slot, hash types 1 and 2 are stored in the slot and used to
// recognise the key again. Collisions are resolved by linear probing.
//
// The object owns the m_HashIndexTable array (allocated in the constructor,
// released in the destructor) and therefore must not be copied; the copy
// operations are declared private and left undefined to enforce that.
class HashMpq
{
public :

    // Fills the lookup table used by HashString(), allocates nTableLength
    // slots and marks all of them empty.
    HashMpq( const   long nTableLength = MAXTABLELEN )
    {
        prepareCryptTable();
        m_tablelength = nTableLength;
        m_HashIndexTable = new MPQHASHTABLE[nTableLength];
        reset(nTableLength);
    }
    // Marks the first nTableLength slots as empty.
    void reset(const long nTableLength);
    // Populates cryptTable; must run before HashString() is used.
    void prepareCryptTable();
    // Case-insensitive hash of lpszFileName; dwHashType selects which
    // 0x100-entry section of cryptTable is mixed in.
    unsigned long HashString(std::string lpszFileName, unsigned long dwHashType);
    // Slot index holding lpszString, or -1 when it is not stored.
    long GetHashTablePos(std::string lpszString);
    // Stores lpszString; returns false when no slot could be found for it.
    bool SetHashTable(std::string lpszString);
    // Slot count currently recorded for the table.
    unsigned long GetTableLength( void );
    // Overwrites the recorded slot count (the slot array is not reallocated).
    void SetTableLength(const unsigned long nLength );
    // Releases the slot array.
    ~HashMpq()
    {
        if ( NULL != m_HashIndexTable )
        {
            delete []m_HashIndexTable;
            m_HashIndexTable = NULL;
            m_tablelength = 0;
        }
    }
public :
    // Slot array; public because ConstructBigData walks it directly.
    MPQHASHTABLE *m_HashIndexTable;
private :
    // Not copyable: a member-wise copy would make two objects delete[] the
    // same array. Declared but intentionally never defined.
    HashMpq( const HashMpq & );
    HashMpq &operator=( const HashMpq & );

    unsigned long cryptTable[0x500];  // five 0x100-entry sections, filled by prepareCryptTable()
    unsigned long m_tablelength;      // slot count used for the modulo in probing
};
#endif /* HASHMPQ_H_ */

转载请注明出处：山水间博客，http://blog.csdn.net/linyanwen99/article/details/8183120

二.HashMpq.cpp实现文件，如下：

#include "windows.h"
#include "HashMpq.h"
#include <ctype.h>
void HashMpq::reset(const long nTableLength) {
   for (int i = 0; i < nTableLength; i++) {
       m_HashIndexTable[i].nHashA = -1;
       m_HashIndexTable[i].nHashB = -1;
       m_HashIndexTable[i].bExists = false;
       m_HashIndexTable[i].test_filename[0] = '\0';
       m_HashIndexTable[i].count = 0;
   }
}
// Fills all 0x500 entries of cryptTable from a fixed-seed pseudo-random
// sequence. For every row in [0, 0x100) five entries are produced, one in
// each 0x100-wide section of the table (row, row + 0x100, ..., row + 0x400).
// Each entry is assembled from two consecutive generator steps: the low
// 16 bits of the first step become the upper half, the low 16 bits of the
// second step become the lower half.
void HashMpq::prepareCryptTable()
{
    unsigned long seed = 0x00100001;
    for (unsigned long row = 0; row < 0x100; ++row)
    {
        unsigned long slot = row;
        for (unsigned long section = 0; section < 5; ++section)
        {
            seed = (seed * 125 + 3) % 0x2AAAAB;
            const unsigned long upperHalf = (seed & 0xFFFF) << 0x10;

            seed = (seed * 125 + 3) % 0x2AAAAB;
            const unsigned long lowerHalf = (seed & 0xFFFF);

            cryptTable[slot] = ( upperHalf | lowerHalf );
            slot += 0x100;
        }
    }
}
// Hashes lpszFileName without regard to letter case.
//
// Every byte up to (not including) the first NUL is upper-cased and used,
// together with dwHashType, to pick an entry of cryptTable that is mixed
// into the running state. dwHashType selects the 0x100-entry section of
// cryptTable (index = dwHashType * 0x100 + upper-cased byte).
// Returns the final value of the first state word.
unsigned long HashMpq::HashString(std::string lpszFileName, unsigned long dwHashType)
{
    unsigned long seed1 = 0x7FED7FED;
    unsigned long seed2 = 0xEEEEEEEE;
    const unsigned long sectionBase = dwHashType << 8;

    for (const unsigned char *cursor =
             reinterpret_cast<const unsigned char *>(lpszFileName.c_str());
         *cursor != 0;
         ++cursor)
    {
        const int upper = toupper(*cursor);
        seed1 = cryptTable[sectionBase + upper] ^ (seed1 + seed2);
        seed2 = upper + seed1 + seed2 + (seed2 << 5) + 3;
    }
    return seed1;
}
long HashMpq::GetHashTablePos(std::string lpszString)
{
    const unsigned long HASH_OFFSET = 0, HASH_A = 1, HASH_B = 2;
    unsigned long nHash = HashString(lpszString, HASH_OFFSET);
    unsigned long nHashA = HashString(lpszString, HASH_A);
    unsigned long nHashB = HashString(lpszString, HASH_B);
    unsigned long nHashStart = nHash % m_tablelength;
    unsigned long nHashPos = nHashStart;
    while ( m_HashIndexTable[nHashPos].bExists)
    {
        if (m_HashIndexTable[nHashPos].nHashA == nHashA && m_HashIndexTable[nHashPos].nHashB == nHashB)
        {
           return nHashPos;
        }
        else
            nHashPos = (nHashPos + 1) % m_tablelength;
        if (nHashPos == nHashStart)
            break;
    }
    return -1;
}
bool HashMpq::SetHashTable(std::string lpszString )
{
    const unsigned long HASH_OFFSET = 0, HASH_A = 1, HASH_B = 2;
    unsigned long nHash = HashString(lpszString, HASH_OFFSET);
    unsigned long nHashA = HashString(lpszString, HASH_A);
   unsigned long nHashB = HashString(lpszString, HASH_B);
   unsigned long nHashStart = nHash % m_tablelength, nHashPos = nHashStart;
   while (m_HashIndexTable[nHashPos].bExists) {
       /*

TODO，判断该IP地址是否已经存在，存在的话，只需在原来的基础上加1即可，这里略去

        nHashPos = (nHashPos + 1) % m_tablelength;
       if (nHashPos == nHashStart) {
           return false;
       }
   }
   m_HashIndexTable[nHashPos].bExists = true;
   m_HashIndexTable[nHashPos].nHashA = nHashA;
   m_HashIndexTable[nHashPos].nHashB = nHashB;
    strcpy( m_HashIndexTable[nHashPos].test_filename, lpszString.c_str());
    m_HashIndexTable[nHashPos].count = 1;
    return true;
}
// Reports the slot count currently recorded for the table (the value used
// as the modulus when probing).
unsigned long HashMpq::GetTableLength(void)
{
    const unsigned long recordedLength = m_tablelength;
    return recordedLength;
}
void HashMpq::SetTableLength( const unsigned long nLength )
{
    m_tablelength = nLength;
    return;
}

转载请注明出处：山水间博客，http://blog.csdn.net/linyanwen99/article/details/8183120

三.ConstructBigData.h头文件，如下：

#ifndef CONSTRUCTBIGDATA_H_
#define CONSTRUCTBIGDATA_H_
#include <string>
#include <map>
#include "HashMpq.h"
class ConstructBigData{
public:
   ConstructBigData(){}
   ConstructBigData(int hashlen):mpq(hashlen),hashMpqLen(hashlen){}
   ~ConstructBigData(){}
public:
   void constructIps(char* fileName);
   void constructIps(std::string fileName);
   void filePartition(std::string fileName);
   void printMpq();
   void findMax();
   void Max();
public:
   HashMpq mpq;
private:
   int hashMpqLen;
};
#endif /* CONSTRUCTBIGDATA_H_ */

四.ConstructBigData.cpp实现文件，如下：

#include "ConstructBigData.h"
#include <fstream>
#include <sstream>
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#include <iostream>
#include <map>
// Creates (or truncates) fileName and writes the text "127.0.0.1" to it,
// without a trailing newline.
void ConstructBigData::constructIps(char* fileName){
   std::ofstream sink;
   sink.open(fileName, std::ios::out);
   sink << "127.0.0.1" << std::flush;
   sink.close();
}

// Creates (or truncates) fileName and fills it with 9,000,000 lines, each a
// dotted quad "a.b.c.d" whose four parts are rand() % 256.
// The generator is seeded from the current time, so every run differs.
// Does nothing further when the file cannot be opened.
void ConstructBigData::constructIps(std::string fileName){
   const long kAddressCount = 9000000;

   std::ofstream outfile(fileName.c_str(), std::ios::out);
   srand(static_cast<unsigned>(time(NULL)));
   if (!outfile) {
       return;  // nowhere to write to
   }

   // Write straight into the buffered stream; flushing per line would force
   // millions of tiny writes. close() flushes whatever is left.
   for (long line = 0; line < kAddressCount; ++line) {
       for (int part = 0; part < 4; ++part) {
           outfile << (rand() % 256) << (part < 3 ? '.' : '\n');
       }
   }
   outfile.close();
}

// Splits fileName into the five bucket files outfile0.txt .. outfile4.txt.
//
// Every input line is parsed as a dotted quad "a.b.c.d"; the four parts are
// packed into one number (a*2^24 + b*2^16 + c*2^8 + d) and the line is
// appended, followed by '\n', to bucket number (packed value % 5). Equal
// addresses therefore always end up in the same bucket.
//
// Lines that do not parse as four numbers in [0, 255] separated by '.' are
// skipped. The bucket files are created (or truncated) even when the input
// cannot be opened, in which case they are left empty.
void ConstructBigData::filePartition(std::string fileName){
   const int kBucketCount = 5;

   std::ifstream infile(fileName.c_str(),std::ios::in);

   std::ofstream buckets[kBucketCount];
   for (int i = 0; i < kBucketCount; ++i) {
       std::ostringstream bucketName;
       bucketName << "outfile" << i << ".txt";
       buckets[i].open(bucketName.str().c_str(), std::ios::out);
   }

   if(!infile){
       return;
   }

   std::string line;
   // getline's own result drives the loop: it ends on EOF or on any read
   // error, and a final line without a trailing newline is still processed.
   while (std::getline(infile, line)) {
       std::istringstream fields(line);
       unsigned long part[4] = {0, 0, 0, 0};
       char dot1 = 0, dot2 = 0, dot3 = 0;
       fields >> part[0] >> dot1 >> part[1] >> dot2 >> part[2] >> dot3 >> part[3];

       const bool wellFormed =
           !fields.fail() &&
           dot1 == '.' && dot2 == '.' && dot3 == '.' &&
           part[0] <= 255 && part[1] <= 255 && part[2] <= 255 && part[3] <= 255;
       if (!wellFormed) {
           continue;  // blank or malformed line: nothing sensible to bucket
       }

       // Unsigned arithmetic: a first part >= 128 shifted by 24 bits does not
       // fit a signed int.
       const unsigned long ipval =
           (((((part[0] << 8) + part[1]) << 8) + part[2]) << 8) + part[3];
       buckets[ipval % kBucketCount] << line << '\n';
   }

   for (int i = 0; i < kBucketCount; ++i) {
       buckets[i].close();  // close() flushes pending output
   }
   infile.close();
}

void ConstructBigData::printMpq(){
   for(int i=0;i<hashMpqLen;++i){
       if(mpq.m_HashIndexTable[i].bExists){
           if(mpq.m_HashIndexTable[i].count > 1)
           printf("%s,%d\n",mpq.m_HashIndexTable[i].test_filename,mpq.m_HashIndexTable[i].count);
       }
   }
}

// Processes the bucket files outfile0.txt .. outfile4.txt one after another.
//
// For each file every non-empty line is recorded in the hash table, then
// "from <file>->" is printed and Max() reports the table's top entry and
// empties the table again for the next file.
// A file that cannot be opened simply contributes no lines. Lines that the
// table could not accept are counted and reported on stderr.
void ConstructBigData::findMax(){
   const int fileNum = 5;
   for(int i=0;i<fileNum;++i){
       std::ostringstream nameBuilder;
       nameBuilder << "outfile" << i << ".txt";
       const std::string path = nameBuilder.str();

       std::ifstream infile(path.c_str(),std::ios::in);
       std::string buffer;
       unsigned long rejected = 0;
       // Looping on getline's result stops on EOF *and* on failure (for
       // example an unopened stream), and keeps a last line that has no
       // trailing newline.
       while (std::getline(infile, buffer)) {
           if (buffer.empty()) {
               continue;  // a blank line is not a key
           }
           if (!mpq.SetHashTable(buffer)) {
               ++rejected;
           }
       }
       infile.close();

       if (rejected != 0) {
           fprintf(stderr, "warning: %lu line(s) of %s were not stored in the hash table\n",
                   rejected, path.c_str());
       }
       printf("from %s->",path.c_str());
       Max();
   }
}

void ConstructBigData::Max(){
   int index = 0;
   for (int i = 0; i < hashMpqLen; ++i) {
       if (mpq.m_HashIndexTable[i].bExists) {
           if(mpq.m_HashIndexTable[i].count > mpq.m_HashIndexTable[index].count){
               index = i;
           }
       }
   }
   if(mpq.m_HashIndexTable[index].bExists){
       printf("%s,%d\n",mpq.m_HashIndexTable[index].test_filename,mpq.m_HashIndexTable[index].count);
   }
   mpq.reset(hashMpqLen);
}

五.main.cpp实现文件，如下：

#include "ConstructBigData.h"
#include <iostream>
#include <stdio.h>
// Runs the whole demo with a 10,000,000-slot table: writes random addresses
// to bigdata.txt, partitions that file into the bucket files, then reports
// the most frequent address of every bucket. Command-line arguments are
// not used.
int main(int argc,char** argv){
   ConstructBigData bd(10000000);
   const std::string fileName("bigdata.txt");

   bd.constructIps(fileName);   // step 1: generate the input
   bd.filePartition(fileName);  // step 2: split it into buckets
   bd.findMax();                // step 3: count per bucket and report
   return 0;
}

说明：关于运行时间什么的，这里懒得做了，有空的时候再加上吧。

PS：初写文章，文笔生涩之处，还请各位见谅。若有疑问或想交流，可加本人YY号：301558660

转载请注明出处：山水间博客，http://blog.csdn.net/linyanwen99/article/details/8183120