大规模数据分析-HW2 part2 GraphLite相关操作

来源：互联网发布：linux 红旗价钱编辑：程序博客网时间：2024/06/05 14:44

第一步：Git下载GraphLite到ubuntu系统

$ wget -r -O GraphLite-master.zip "https://codeload.github.com/schencoding/GraphLite/zip/master"

解压下载的文件到/home/hadoop/GraphLite-0.20

$ unzip GraphLite-master.zip

第二步：配置环境变量

$ sudo vim /etc/profile
# 添加以下内容
# GraphLite
export GRAPHLITE_HOME=/home/hadoop/GraphLite-0.20
export LD_LIBRARY_PATH=$JAVA_HOME/jre/lib/amd64:$JAVA_HOME/jre/lib/amd64/server
# 并更新环境变量
$ source /etc/profile

第三步：下载相关支持

$ sudo apt-get install protobuf-c-compiler libprotobuf-c0 libprotobuf-c0-dev
$ sudo apt-get install make
$ sudo apt-get install g++

第四步：设置GraphLite-0.20文件夹下bin/setenv中的JAVA_HOME, HADOOP_HOME, GRAPHLITE_HOME

$ vim bin/setenv
# 添加内容
export JAVA_HOME=/usr/local/java/jdk1.8.0_121
export HADOOP_HOME=/home/hadoop/hadoop-2.7.3
export GRAPHLITE_HOME=/home/hadoop/GraphLite-0.20
export LD_LIBRARY_PATH=$JAVA_HOME/jre/lib/amd64:$JAVA_HOME/jre/lib/amd64/server

第五步：在engine/目录下make工程

$ cd engine
$ make

第六步：在example/目录下make工程

报如下错误

g++ -std=c++0x -g -O2 -I/usr/local/hadoop-2.7.3/include -I/usr/local/563home/GraphLite-0.20/include PageRankVertex.cc -fPIC -shared -o PageRankVertex.so
PageRankVertex.cc:33:23: fatal error: GraphLite.h: 没有那个文件或目录
 #include "GraphLite.h"
                       ^
compilation terminated.
make: *** [PageRankVertex.so] 错误 1

找不到 GraphLite.h 文件：检查后发现是环境变量 GRAPHLITE_HOME 写错了，改正后重新 make 即可。

第七步：cd 到GraphLite-0.20根目录下执行命令

$ start-graphlite example/PageRankVertex.so Input/facebookcombined_4w Output/out

注意在每次开启shell并运行example时都要先执行指令：

$ . bin/setenv    # 注意：必须用 "."（source）在当前 shell 中加载；直接执行 ./bin/setenv 只会在子进程中设置变量，对当前 shell 不生效

第八步：我对PageRankVertex.cc代码的理解

#include <stdio.h>#include <string.h>#include <math.h>#include "GraphLite.h"#define VERTEX_CLASS_NAME(name) PageRankVertex##name  //宏定义，不解释#define EPS 1e-6    // PageRank结束的deltaclass VERTEX_CLASS_NAME(InputFormatter): public InputFormatter {/* 对输入文件内容的读取类*/public:    int64_t getVertexNum() {        unsigned long long n;        sscanf(m_ptotal_vertex_line, "%lld", &n);        m_total_vertex= n;        return m_total_vertex;    }    int64_t getEdgeNum() {        unsigned long long n;        sscanf(m_ptotal_edge_line, "%lld", &n);        m_total_edge= n;        return m_total_edge;    }    int getVertexValueSize() {        m_n_value_size = sizeof(double);        return m_n_value_size;    }    int getEdgeValueSize() {        m_e_value_size = sizeof(double);        return m_e_value_size;    }    int getMessageValueSize() {        m_m_value_size = sizeof(double);        return m_m_value_size;    }    // 该函数被我修改过（为了代码简洁）    void loadGraph() {        unsigned long long last_vertex;        unsigned long long from;        unsigned long long to;        int outdegree = 0;     //节点的出度        double value = 0.0;    //节点的权重        double weight = 1.0;   //边权重        /* 因为文件的存取是from有序排列并在同一个文件中           所以添加Vertex时是按照from添加（因为不会重复添加）        */        for (int64_t i = 0; i < m_total_edge; i++) {            const char *line= getEdgeLine();            sscanf(line, "%lld %lld", &from, &to); //注意读取文件的读取格式            if ( i != 0 && last_vertex != from) {                addVertex(last_vertex, &value, outdegree);                last_vertex = from;                outdegree = 1;            } else if( i == 0 ){                last_vertex = from;                outdegree = 1;            } else{                outdegree++;            }            addEdge(from, to, &weight);        }        addVertex(last_vertex, &value, outdegree);    }};class VERTEX_CLASS_NAME(OutputFormatter): public OutputFormatter {/* 将节点的信息输出到目标文件*/public:    // 只需重写该方法即可    void writeResult() {        int64_t vid;      
  double value;        char s[1024];        for (ResultIterator r_iter; ! r_iter.done(); r_iter.next() ) {            r_iter.getIdValue(vid, &value);            int n = sprintf(s, "%lld: %f\n", (unsigned long long)vid, value);            writeNextResLine(s, n);        }    }};// An aggregator that records a double value tom compute sumclass VERTEX_CLASS_NAME(Aggregator): public Aggregator<double> {/* 定义所有的Aggregator类    我的理解是Master有一个超级Aggregator来维护m_global( 存放所有图节点的delta PageRank值 )    每个Worker有一个Aggregator来维护超级m_local值，即该Worker下对应Verter节点的 delta PageRank值    每个Worker中的Vertex类有一个Aggregator来维护对应图节点的delta PageRank值。    */public:    void init() {        /* 每个超步内Vertex会新建一个Aggregator,并在超步结束时销毁。           每个超步内Worker会新建一个Aggregator,并在超步结束时销毁。           每个超步内Master会新建一个Aggregator,并在整个程序结束时销毁。        */        m_global = 0;        m_local = 0;    }    void* getGlobal() {        return &m_global;    }    void setGlobal(const void* p) {        m_global = * (double *)p;    }    void* getLocal() {        return &m_local;    }    void merge(const void* p) {        // 每个超步结束时，Master会将所有Worker发送来的->(Worker内sum delta PageRank)累加。        m_global += * (double *)p;    }    void accumulate(const void* p) {        // 每个超步内，Worker会将所有Vertex发送来的->(delta PageRank)累加。        m_local += * (double *)p;    }};class VERTEX_CLASS_NAME(Graph): public Graph {/* 构建整个图*/public:    // 该aggergator其实就是Master中的global aggregator    VERTEX_CLASS_NAME(Aggregator)* aggregator;public:    // argv[0]: PageRankVertex.so    // argv[1]: <input path>    // argv[2]: <output path>    void init(int argc, char* argv[]) {        //申请worker的个数        setNumHosts(5);        setHost(0, "localhost", 1411);        setHost(1, "localhost", 1421);        setHost(2, "localhost", 1431);        setHost(3, "localhost", 1441);        setHost(4, "localhost", 1451);        if (argc < 3) {           printf ("Usage: %s <input path> <output path>\n", argv[0]);           exit(1);        }        m_pin_path = argv[1];        
m_pout_path = argv[2];        aggregator = new VERTEX_CLASS_NAME(Aggregator)[1];        //在这里可以对aggregator中的m_global进行赋值操作        regNumAggr(1); //注册聚合器的个数        regAggr(0, &aggregator[0]); //注册聚合器    }    void term() {        delete[] aggregator;    }};class VERTEX_CLASS_NAME(): public Vertex <double, double, double> {/* 整个图的节点类*/public:    // compute函数是每个超步内每个Worker中的每个Vertex都要执行的方法。    void compute(MessageIterator* pmsgs) {        double val;        if ( getSuperstep() == 0 ) { val = 1.0; }         else {            if ( getSuperstep() >= 2 ) {                double global_val = * (double *)getAggrGlobal(0);                if (global_val < EPS) {                    voteToHalt(); return;                }            }            double sum = 0;            for ( ; ! pmsgs->done(); pmsgs->next() ) {                sum += pmsgs->getValue();            }            val = 0.15 + 0.85 * sum; // pagerank更新公式            // 以下2行是 给该Vertex中的Aggregator赋值,为->(delta PageRank)            double acc = fabs(getValue() - val);            accumulateAggr(0, &acc); //整个图聚合器的ID=0,值为acc。        }        * mutableValue() = val; //给该节点Vertex赋点权，值为val        const int64_t n = getOutEdgeIterator().size();        sendMessageToAllNeighbors(val / n);    }};/* STOP: do not change the code below. */extern "C" Graph* create_graph() {    Graph* pgraph = new VERTEX_CLASS_NAME(Graph);    pgraph->m_pin_formatter = new VERTEX_CLASS_NAME(InputFormatter);    pgraph->m_pout_formatter = new VERTEX_CLASS_NAME(OutputFormatter);    pgraph->m_pver_base = new VERTEX_CLASS_NAME();        return pgraph;}extern "C" void destroy_graph(Graph* pobject) {    delete ( VERTEX_CLASS_NAME()* )(pobject->m_pver_base);    delete ( VERTEX_CLASS_NAME(OutputFormatter)* )(pobject->m_pout_formatter);    delete ( VERTEX_CLASS_NAME(InputFormatter)* )(pobject->m_pin_formatter);    delete ( VERTEX_CLASS_NAME(Graph)* )pobject;}

第九步：补充的一些内容

1、可以将图信息分裂为k个可读文件，k的取值应等于XXX.cc代码中设置的Worker个数。分裂指令如下:

$ hash-partitioner.pl <File Path> <K>
$ hash-partitioner.pl Input/facebookcombined 4

2、C++中的字符转换成double类型

// Convert a C string to a double with sscanf ("%lf" is the conversion for double).
const char *x_str = "34.33333";  // string literals are const in C++
double KCore = 0.0;
// sscanf returns the number of fields converted; 1 means KCore was set.
const int parsed = sscanf(x_str, "%lf", &KCore);

3、GraphLite程序中include头文件关系说明

0 0