OpenRTMFP/Cumulus开发笔记(7) Cumulus大数据处理实例(续)

来源:互联网 发布:影子写手 知乎 编辑:程序博客网 时间:2024/05/16 14:51

前一篇啰啰嗦嗦,已经把相关原理叙述了,这里把源码附上,如下:

一.HashMpq.h头文件如下:

#ifndef HASHMPQ_H_
#define HASHMPQ_H_
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <string>
#define MAXFILENAME 25
#define MAXTABLELEN 1024
// One slot of the hash table managed by HashMpq.
// Field order is significant for the struct layout and must not change.
struct MPQHASHTABLE
{
    // First verification hash of the stored key.
    long nHashA;
    // Second verification hash of the stored key.
    long nHashB;
    // True while the slot holds a key; false means the slot is free.
    bool bExists;
    // NUL-terminated copy of the key, at most MAXFILENAME - 1 characters.
    char test_filename[MAXFILENAME];
    // Occurrence counter kept for the key stored in this slot.
    int count;
};
class  HashMpq
{
public :

    HashMpq( const   long  nTableLength = MAXTABLELEN )
    {
        prepareCryptTable();
        m_tablelength = nTableLength;
        m_HashIndexTable = new  MPQHASHTABLE[nTableLength];
        reset(nTableLength);
    }
    void reset(const long nTableLength);
    void  prepareCryptTable();
    unsigned long  HashString(std::string lpszFileName, unsigned long dwHashType);
    long  GetHashTablePos(std::string lpszString);
    bool  SetHashTable(std::string lpszString);
    unsigned long  GetTableLength( void );
    void  SetTableLength(const unsigned long nLength );
    ~HashMpq()
    {
        if  ( NULL != m_HashIndexTable )
        {
            delete  []m_HashIndexTable;
            m_HashIndexTable = NULL;
            m_tablelength = 0;
        }
    }
public :
    MPQHASHTABLE *m_HashIndexTable;
private :
    unsigned long  cryptTable[0x500];
    unsigned long  m_tablelength;
};
#endif /* HASHMPQ_H_ */

转载请注明出处:山水间博客,http://blog.csdn.net/linyanwen99/article/details/8183120

二.HashMpq.cpp实现文件,如下:

#include "windows.h"
#include "HashMpq.h"
#include <ctype.h>
void HashMpq::reset(const long nTableLength) {
    for (int i = 0; i < nTableLength; i++) {
        m_HashIndexTable[i].nHashA = -1;
        m_HashIndexTable[i].nHashB = -1;
        m_HashIndexTable[i].bExists = false;
        m_HashIndexTable[i].test_filename[0] = '\0';
        m_HashIndexTable[i].count = 0;
    }
}
void HashMpq::prepareCryptTable()
{
    unsigned long seed = 0x00100001, index1 = 0, index2 = 0, i;
    for( index1 = 0; index1 < 0x100; index1++ )
    {
        for( index2 = index1, i = 0; i < 5; i++, index2 += 0x100 )
        {
            unsigned long temp1, temp2;
            seed = (seed * 125 + 3) % 0x2AAAAB;
            temp1 = (seed & 0xFFFF) << 0x10;
            seed = (seed * 125 + 3) % 0x2AAAAB;
            temp2 = (seed & 0xFFFF);
            cryptTable[index2] = ( temp1 | temp2 );
        }
    }
}
unsigned long HashMpq::HashString(std::string lpszFileName, unsigned long dwHashType)
{
    unsigned char *key = (unsigned char *)lpszFileName.c_str();
    unsigned long seed1 = 0x7FED7FED, seed2 = 0xEEEEEEEE;
    int ch;
    while(*key != 0)
    {
        ch = toupper(*key++);
        seed1 = cryptTable[(dwHashType << 8) + ch] ^ (seed1 + seed2);
        seed2 = ch + seed1 + seed2 + (seed2 << 5) + 3;
    }
    return seed1;
}
long HashMpq::GetHashTablePos(std::string lpszString)
{
    const unsigned long HASH_OFFSET = 0, HASH_A = 1, HASH_B = 2;
    unsigned long nHash = HashString(lpszString, HASH_OFFSET);
    unsigned long nHashA = HashString(lpszString, HASH_A);
    unsigned long nHashB = HashString(lpszString, HASH_B);
    unsigned long nHashStart = nHash % m_tablelength;
    unsigned long nHashPos = nHashStart;
    while ( m_HashIndexTable[nHashPos].bExists)
    {
        if (m_HashIndexTable[nHashPos].nHashA == nHashA && m_HashIndexTable[nHashPos].nHashB == nHashB)
        {
            return nHashPos;
        }
        else
            nHashPos = (nHashPos + 1) % m_tablelength;
        if (nHashPos == nHashStart)
            break;
    }
    return -1;
}
bool HashMpq::SetHashTable(std::string lpszString )
{
    const unsigned long HASH_OFFSET = 0, HASH_A = 1, HASH_B = 2;
    unsigned long nHash = HashString(lpszString, HASH_OFFSET);
    unsigned long nHashA = HashString(lpszString, HASH_A);
    unsigned long nHashB = HashString(lpszString, HASH_B);
    unsigned long nHashStart = nHash % m_tablelength, nHashPos = nHashStart;
    while (m_HashIndexTable[nHashPos].bExists) {
       /*

        TODO,判断该IP地址是否已经存在,存在的话,只需在原来的基础上加1即可,这里略去

        */

        nHashPos = (nHashPos + 1) % m_tablelength;
        if (nHashPos == nHashStart) {
            return false;
        }
    }
    m_HashIndexTable[nHashPos].bExists = true;
    m_HashIndexTable[nHashPos].nHashA = nHashA;
    m_HashIndexTable[nHashPos].nHashB = nHashB;
    strcpy( m_HashIndexTable[nHashPos].test_filename, lpszString.c_str());
    m_HashIndexTable[nHashPos].count = 1;
    return true;
}
unsigned long HashMpq::GetTableLength(void)
{
    return m_tablelength;
}
void HashMpq::SetTableLength( const unsigned long nLength )
{
    m_tablelength = nLength;
    return;
}

转载请注明出处:山水间博客,http://blog.csdn.net/linyanwen99/article/details/8183120

三.ConstructBigData.h头文件,如下:

#ifndef CONSTRUCTBIGDATA_H_
#define CONSTRUCTBIGDATA_H_
#include <string>
#include <map>
#include "HashMpq.h"
class ConstructBigData{
public:
    ConstructBigData(){}
    ConstructBigData(int hashlen):mpq(hashlen),hashMpqLen(hashlen){}
    ~ConstructBigData(){}
public:
    void constructIps(char* fileName);
    void constructIps(std::string fileName);
    void filePartition(std::string fileName);
    void printMpq();
    void findMax();
    void Max();
public:
    HashMpq mpq;
private:
    int hashMpqLen;
};
#endif /* CONSTRUCTBIGDATA_H_ */

四.ConstructBigData.cpp实现文件,如下:

#include "ConstructBigData.h"
#include <fstream>
#include <sstream>
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#include <iostream>
#include <map>
// Creates (or truncates) fileName and writes the single address
// "127.0.0.1" to it, without a trailing newline.
void ConstructBigData::constructIps(char* fileName){
    std::ofstream sink;
    sink.open(fileName, std::ios::out);
    sink << "127.0.0.1";
    // close() flushes any buffered output before releasing the file.
    sink.close();
}

void ConstructBigData::constructIps(std::string fileName){
    std::ofstream outfile(fileName.c_str(), std::ios::out);
    std::stringstream ip("");
    unsigned short num = 0;
    srand((unsigned) time(NULL));
    for (int i = 0; i < 9000000; ++i) {
        for (int j = 0; j < 4; ++j) {
            num = (rand() % 256);
            ip << num;
            if (j < 3)
                ip << '.';
            else
                ip << '\n';
        }
        outfile << ip.str();
        ip.str("");
        outfile.flush();
    }
    outfile.close();
}

void ConstructBigData::filePartition(std::string fileName){
    std::ifstream infile(fileName.c_str(),std::ios::in);
    std::ofstream outfile0("outfile0.txt",std::ios::out);
    std::ofstream outfile1("outfile1.txt",std::ios::out);
    std::ofstream outfile2("outfile2.txt",std::ios::out);
    std::ofstream outfile3("outfile3.txt",std::ios::out);
    std::ofstream outfile4("outfile4.txt",std::ios::out);
    if(!infile){
        return;
    }
    unsigned short val1,val2,val3,val4;
    unsigned char ch1,ch2,ch3;
    unsigned long ipval = 0;
    int modval = 0;
    std::stringstream ss;
    std::string buffer;
    std::stringstream ssbuf("");
    while (!infile.eof()) {
        getline(infile,buffer);
        ssbuf<<buffer;
        if(!infile.eof()){
            ssbuf >> val1 >> ch1 >> val2 >> ch2 >> val3 >> ch3 >> val4;
            ipval = (((((val1<<8) + val2)<<8)+val3)<<8)+val4;
            modval = ipval % 5;
            switch(modval){
            case 0:
                outfile0 << ssbuf.str() << '\n';
                break;
            case 1:
                outfile1 << ssbuf.str() << '\n';
                break;
            case 2:
                outfile2 << ssbuf.str() << '\n';
                break;
            case 3:
                outfile3 << ssbuf.str() << '\n';
                break;
            case 4:
                outfile4 << ssbuf.str() << '\n';
                break;
            default:
                std::cout<<"sb"<<std::endl;
                break;
            }
            ipval = 0;
        }
        ssbuf.clear();
        ssbuf.str("");
    }
    outfile0.flush();
    outfile1.flush();
    outfile2.flush();
    outfile3.flush();
    outfile4.flush();

    outfile0.close();
    outfile1.close();
    outfile2.close();
    outfile3.close();
    outfile4.close();
    infile.close();
}

void ConstructBigData::printMpq(){
    for(int i=0;i<hashMpqLen;++i){
        if(mpq.m_HashIndexTable[i].bExists){
            if(mpq.m_HashIndexTable[i].count > 1)
            printf("%s,%d\n",mpq.m_HashIndexTable[i].test_filename,mpq.m_HashIndexTable[i].count);
        }
    }
}

void ConstructBigData::findMax(){
    int fileNum = 5;
    std::stringstream ss("");
    for(int i=0;i<fileNum;++i){
        std::string fileName = "outfile";
        std::string suffix = ".txt";
        ss<<fileName<<i<<suffix;
        std::ifstream infile(ss.str().c_str(),std::ios::in);
        std::string buffer;
        while (!infile.eof()) {
                getline(infile,buffer);
                if(!infile.eof()){
                    mpq.SetHashTable(buffer);
                }
        }
        infile.close();
        printf("from %s->",ss.str().c_str());
        Max();
        ss.clear();
        ss.str("");
    }
}

void ConstructBigData::Max(){
    int index = 0;
    for (int i = 0; i < hashMpqLen; ++i) {
        if (mpq.m_HashIndexTable[i].bExists) {
            if(mpq.m_HashIndexTable[i].count > mpq.m_HashIndexTable[index].count){
                index = i;
            }
        }
    }
    if(mpq.m_HashIndexTable[index].bExists){
        printf("%s,%d\n",mpq.m_HashIndexTable[index].test_filename,mpq.m_HashIndexTable[index].count);
    }
    mpq.reset(hashMpqLen);
}

五.main.cpp实现文件,如下:

#include "ConstructBigData.h"
#include <iostream>
#include <stdio.h>
int main(int argc,char** argv){
    ConstructBigData bd(10000000);
    std::string fileName = "bigdata.txt";
    bd.constructIps(fileName);
    bd.filePartition(fileName);
    bd.findMax();
    return 0;
}

说明:本文没有统计程序的运行时间,后续有空时再补充。

PS:初写文章,文笔生涩之处,还请各位见谅。若有疑问或想交流,可加本人YY号:301558660

转载请注明出处:山水间博客,http://blog.csdn.net/linyanwen99/article/details/8183120





原创粉丝点击