Python网络分析-networkx第一版的封装

来源:互联网 发布:mac终端c语言 编辑:程序博客网 时间:2024/06/05 16:52

本文主要是对我在做实验时用到的networkx进行的一个初步封装。这一版其实不是很规范,现在正在写第二版,先把之前的代码贴上来。主要参考的文档是networkx的官方文档。
[networkx-reference]

我需要说明一点,下面的代码针对的是无向图

代码

下面这一部分代码是对networkx的初步封装。

  • GraphOperation.py
# -*- coding: utf-8 -*-
"""A thin utility wrapper around a networkx graph (first version)."""
import functools
import traceback

import matplotlib.pyplot as plt
import networkx as nx


def _log_errors(method):
    """Wrap *method* so that any exception is printed instead of raised.

    The wrapped method returns ``None`` when an exception occurred, which
    keeps the best-effort behaviour every method of this module had.
    """
    @functools.wraps(method)
    def wrapper(*args, **kwargs):
        try:
            return method(*args, **kwargs)
        except Exception:
            # print_exc() already writes the traceback; printing its return
            # value (as the first version did) only emitted a stray "None".
            traceback.print_exc()
            return None
    return wrapper


class GraphOperation:
    """Utility class exposing common networkx operations on ``self.graph``.

    Every public method is best-effort: failures are reported through a
    printed traceback and the method returns ``None``.
    """

    # ----------------- graph operation -----------------
    def __init__(self):
        """Create an empty undirected graph."""
        self.graph = nx.Graph()

    @_log_errors
    def convert_to_directed_graph(self):
        """Replace the graph by a ``nx.DiGraph`` holding the same data."""
        # Passing the current graph keeps existing nodes/edges; an empty
        # graph still yields an empty DiGraph, as before.
        self.graph = nx.DiGraph(self.graph)

    @_log_errors
    def convert_to_multi_graph(self):
        """Replace the graph by a ``nx.MultiGraph`` holding the same data."""
        self.graph = nx.MultiGraph(self.graph)

    @_log_errors
    def convert_to_undirected_graph(self):
        """Replace the graph by a ``nx.Graph`` holding the same data."""
        self.graph = nx.Graph(self.graph)

    @_log_errors
    def clear_graph(self):
        """Remove all nodes and edges from the graph."""
        self.graph.clear()

    # ------------------ node operation ------------------
    @_log_errors
    def add_node(self, node):
        """Add a single node."""
        self.graph.add_node(node)

    @_log_errors
    def add_nodes_by_list(self, node_list):
        """Add every node contained in *node_list*."""
        self.graph.add_nodes_from(node_list)

    @_log_errors
    def remove_node(self, node):
        """Remove a single node (an unknown node is reported as an error)."""
        self.graph.remove_node(node)

    @_log_errors
    def remove_nodes_by_list(self, node_list):
        """Remove every node contained in *node_list*."""
        self.graph.remove_nodes_from(node_list)

    @_log_errors
    def get_number_of_nodes(self):
        """Return the number of nodes."""
        return self.graph.number_of_nodes()

    @_log_errors
    def get_nodes(self):
        """Return the nodes as provided by ``graph.nodes()``."""
        return self.graph.nodes()

    @_log_errors
    def get_neighbors(self, v):
        """Return the neighbors of node *v* as provided by networkx."""
        return self.graph.neighbors(v)

    # ------------------ edge operation ------------------
    @_log_errors
    def add_edge(self, u, v):
        """Add an edge between *u* and *v*."""
        self.graph.add_edge(u, v)

    @_log_errors
    def add_edge_by_tuple(self, e):
        """Add the edge described by the tuple *e* = ``(u, v)``."""
        self.add_edge(*e)  # unpack the edge tuple

    @_log_errors
    def add_edges_by_list(self, edge_list):
        """Add every edge of *edge_list*, a list of ``(u, v)`` tuples."""
        self.graph.add_edges_from(edge_list)

    @_log_errors
    def remove_edge(self, u, v):
        """Remove the edge between *u* and *v*."""
        self.graph.remove_edge(u, v)

    @_log_errors
    def remove_edge_by_tuple(self, e):
        """Remove the edge described by the tuple *e* = ``(u, v)``."""
        self.remove_edge(*e)

    @_log_errors
    def remove_edges_by_list(self, edge_list):
        """Remove every edge of *edge_list*, a list of ``(u, v)`` tuples."""
        # The removal must go through the wrapped graph; this class itself
        # has no remove_edges_from method.
        self.graph.remove_edges_from(edge_list)

    @_log_errors
    def get_number_of_edges(self):
        """Return the number of edges."""
        return self.graph.number_of_edges()

    @_log_errors
    def get_edges(self):
        """Return the edges as provided by ``graph.edges()``."""
        return self.graph.edges()

    @_log_errors
    def add_weighted_edge(self, weighted_edge_list):
        """Add weighted edges from a list of ``(u, v, weight)`` tuples."""
        self.graph.add_weighted_edges_from(weighted_edge_list)

    @_log_errors
    def get_weighted_edge(self):
        """Return the edges together with their ``weight`` attribute."""
        return self.graph.edges(data='weight')

    # ------------------ degree analysis ------------------
    @_log_errors
    def get_degree(self):
        """Return the degree of all nodes as provided by ``graph.degree()``."""
        return self.graph.degree()

    @_log_errors
    def get_degree_by_node(self, node_id):
        """Return the degree of node *node_id*."""
        return self.graph.degree(node_id)

    @_log_errors
    def get_degree_based_on_weight_by_node(self, node_id):
        """Return the weighted degree of node *node_id*.

        The degree is the sum of the ``weight`` of the incident edges, e.g.
        with edges (1, 2, 0.5) and (3, 1, 0.75) node 1 has degree 1.25.
        """
        return self.graph.degree(node_id, weight="weight")

    @_log_errors
    def get_sorted_degrees(self):
        """Return the degree values of all nodes, largest first."""
        # dict() accepts both a plain dict and an iterable of
        # (node, degree) pairs, so either form of degree result works.
        return sorted(dict(nx.degree(self.graph)).values(), reverse=True)

    @_log_errors
    def get_in_degree(self):
        """Return the in-degree of all nodes (calls ``graph.in_degree()``)."""
        return self.graph.in_degree()

    @_log_errors
    def get_in_degree_by_node(self, node_id):
        """Return the in-degree of node *node_id*."""
        return self.graph.in_degree(node_id)

    @_log_errors
    def get_in_degree_based_on_weight_by_node(self, node_id):
        """Return the weighted in-degree of node *node_id*."""
        return self.graph.in_degree(node_id, weight="weight")

    @_log_errors
    def get_out_degree(self):
        """Return the out-degree of all nodes (calls ``graph.out_degree()``)."""
        return self.graph.out_degree()

    @_log_errors
    def get_out_degree_by_node(self, node_id):
        """Return the out-degree of node *node_id*."""
        return self.graph.out_degree(node_id)

    @_log_errors
    def get_out_degree_based_on_weight_by_node(self, node_id):
        """Return the weighted out-degree of node *node_id*."""
        return self.graph.out_degree(node_id, weight="weight")

    # ------------------ component analysis ------------------
    @_log_errors
    def get_connected_components(self):
        """Return the result of ``nx.connected_components`` for the graph."""
        return nx.connected_components(self.graph)

    # ------------------ drawing graph ------------------
    @_log_errors
    def draw_graph(self, title):
        """Draw the graph with *title* as the figure title and show it."""
        plt.title(title)
        nx.draw(self.graph)
        # The title is set via plt.title above, so show() gets no argument.
        plt.show()

    @_log_errors
    def draw_network(self):
        """Draw the graph with labels using a spring layout and show it."""
        # pos must be the computed node positions, not the layout function.
        nx.draw_networkx(self.graph, pos=nx.spring_layout(self.graph))
        plt.show()

    @_log_errors
    def draw_graph_random_layout(self):
        """Draw the graph with a random layout and show it."""
        nx.draw_random(self.graph)
        plt.show()

    @_log_errors
    def draw_graph_spring_layout(self):
        """Draw the graph with a spring layout and show it."""
        nx.draw_spring(self.graph)
        plt.show()

    # ------------------ graph metrics ------------------
    @_log_errors
    def get_degree_distribution(self):
        """Return the degree histogram of the graph.

        Unlike the degree getters, which report a degree per node, this
        reports how many nodes have each degree value
        (``nx.degree_histogram``).
        """
        return nx.degree_histogram(self.graph)

    @_log_errors
    def get_density(self):
        """Return the density of the graph."""
        return nx.density(self.graph)

    @_log_errors
    def get_transitivity(self):
        """Return the transitivity (global clustering coefficient)."""
        return nx.transitivity(self.graph)

    @_log_errors
    def get_averate_clustering(self):
        """Return the average clustering coefficient.

        The misspelled name is kept because existing callers use it.
        """
        return nx.average_clustering(self.graph)

    # Correctly spelled alias for new callers.
    get_average_clustering = get_averate_clustering

    @_log_errors
    def get_average_shortest_path_length(self):
        """Return the average shortest path length of the graph."""
        return nx.average_shortest_path_length(self.graph)

    @_log_errors
    def write_to_pajek(self, pajek_net_path):
        """Write the graph in Pajek format to *pajek_net_path*."""
        nx.write_pajek(self.graph, pajek_net_path)

    # ------------------ centrality ------------------
    @_log_errors
    def get_degree_centrality(self):
        """Return ``nx.degree_centrality`` of the graph.

        The degree centrality of a node is the fraction of nodes it is
        connected to.
        """
        return nx.degree_centrality(self.graph)

    @_log_errors
    def get_betweenness_centrality(self):
        """Return ``nx.betweenness_centrality`` of the graph.

        Betweenness centrality of a node is the sum of the fraction of
        all-pairs shortest paths that pass through it.
        """
        return nx.betweenness_centrality(self.graph)

    @_log_errors
    def get_load_centrality(self):
        """Return ``nx.load_centrality`` of the graph.

        The load centrality of a node is the fraction of all shortest paths
        that pass through it.
        """
        return nx.load_centrality(self.graph)

    @_log_errors
    def get_eigenvector_centrality(self):
        """Return ``nx.eigenvector_centrality`` of the graph.

        Eigenvector centrality rates a node based on the centrality of its
        neighbors.
        """
        return nx.eigenvector_centrality(self.graph)
  • MyGraph.py
# -*- coding: utf-8 -*-
"""Graph workflow class built on top of the GraphOperation utility class."""
import functools
import traceback

import matplotlib.pyplot as plt
import networkx as nx

from GraphOperation import GraphOperation


def _log_errors(method):
    """Wrap *method* so that any exception is printed instead of raised.

    The wrapped method returns ``None`` when an exception occurred, which
    keeps the best-effort behaviour every method of this class had.
    """
    @functools.wraps(method)
    def wrapper(*args, **kwargs):
        try:
            return method(*args, **kwargs)
        except Exception:
            traceback.print_exc()
            return None
    return wrapper


class MyGraph:
    """Build, draw and analyse a graph made of cliques of named members.

    Names are mapped to integer node ids; analysis results are written to
    text files whose paths are ``output_path`` + a fixed file name.
    """

    def __init__(self):
        """Initialise the wrapped graph and all bookkeeping members."""
        self.my_graph = GraphOperation()
        self.map_name_to_number = dict()
        self.map_number_to_name = dict()
        self.output_path = ""
        self.clique_list = []  # numeric cliques, used by draw_community
        self.max_connected_component_subgraph = None

    # ------------------ private helpers ------------------
    def _write_lines(self, file_name, lines):
        """Write *lines* (already newline-terminated) to output_path + file_name."""
        with open(self.output_path + file_name, "w") as outfile:
            outfile.writelines(lines)

    def _write_value(self, file_name, value):
        """Write ``str(value)`` as the single line of output_path + file_name."""
        self._write_lines(file_name, [str(value) + '\n'])

    # ------------------ construction ------------------
    @_log_errors
    def construct_graph(self, clique_list):
        """Build the graph from *clique_list*, a list of lists of names.

        Every name gets an integer id (both mappings are updated) and all
        members of a clique are connected pairwise.
        """
        # Continue after the ids already handed out so that a repeated call
        # cannot assign an existing id to a new name (first call starts at 1).
        next_number = len(self.map_name_to_number) + 1
        new_clique_list = []
        for clique in clique_list:
            new_clique = []
            for name in clique:
                if name not in self.map_name_to_number:
                    self.map_name_to_number[name] = next_number
                    next_number += 1
                new_clique.append(self.map_name_to_number[name])
            new_clique_list.append(new_clique)

        # Rebuild the reverse mapping (id -> name) from scratch.
        self.map_number_to_name = dict(
            (number, name)
            for name, number in self.map_name_to_number.items()
        )
        self.clique_list = new_clique_list

        for clique in new_clique_list:
            for u in clique:
                # Add the node explicitly so single-member cliques appear.
                self.my_graph.add_node(u)
                for v in clique:
                    if u == v:
                        continue
                    self.my_graph.add_edge_by_tuple((u, v))
        print("[INFO]: construct_graph is finished!")

    @_log_errors
    def add_edge(self, u, v):
        """Add an edge between node ids *u* and *v*."""
        self.my_graph.add_edge(u, v)

    @_log_errors
    def get_all_edges(self):
        """Return all edges of the wrapped graph."""
        return self.my_graph.get_edges()

    @_log_errors
    def set_output_path(self, output_path):
        """Set the prefix prepended to every output file name."""
        self.output_path = output_path
        print("[INFO]: set_output_path is finished!")

    @_log_errors
    def set_max_connected_component_subgraph(self):
        """Compute and store the largest connected component as a subgraph.

        Must be called after the whole graph has been built.
        """
        graph = self.my_graph.graph
        # connected_components + subgraph replaces the removed
        # nx.connected_component_subgraphs helper.
        largest = max(nx.connected_components(graph), key=len)
        self.max_connected_component_subgraph = graph.subgraph(largest).copy()
        print("[INFO]: set_max_connected_component_subgraph is finished!")

    @_log_errors
    def get_max_connected_component_subgraph(self):
        """Return the stored largest-component subgraph (a native nx graph)."""
        return self.max_connected_component_subgraph

    # ------------------ draw the network ------------------
    @_log_errors
    def draw_community(self):
        """Draw every stored clique as a community with its own color."""
        graph = self.my_graph.graph
        pos = nx.spring_layout(graph)
        node_size_ = 100
        color_list = ["red", "yellow", "blue", "green",
                      "pink", "orange", "purple"]
        color_list_len = len(color_list)

        for i, node_list in enumerate(self.clique_list):
            edge_list = self.get_edges_for_community(node_list)
            # Cycle through the palette so more communities than colors
            # do not raise an IndexError.
            nx.draw_networkx_nodes(graph, pos, nodelist=node_list,
                                   node_size=node_size_,
                                   node_color=color_list[i % color_list_len],
                                   label="hello")
            nx.draw_networkx_edges(graph, pos, edgelist=edge_list)

        title = "people relation by train"
        plt.title(title)
        plt.show()
        print("[INFO]: draw_community is finished!")

    @_log_errors
    def get_edges_for_community(self, node_list):
        """Return all ordered pairs ``(u, v)`` of distinct nodes in *node_list*."""
        return [(u, v) for u in node_list for v in node_list if u != v]

    @_log_errors
    def draw_graph(self, title):
        """Draw the graph with *title* as the figure title."""
        self.my_graph.draw_graph(title)
        print("[INFO]: draw_graph is finished!")

    @_log_errors
    def draw_network(self):
        """Draw the graph with labels."""
        # Delegate to the wrapped graph; calling self.draw_network() here
        # would recurse forever.
        self.my_graph.draw_network()

    @_log_errors
    def draw_graph_random_layout(self):
        """Draw the graph with a random layout."""
        self.my_graph.draw_graph_random_layout()

    @_log_errors
    def draw_graph_spring_layout(self):
        """Draw the graph with a spring layout."""
        self.my_graph.draw_graph_spring_layout()
        print("[INFO]: draw_graph is finished!")

    # ------------------ network analysis ------------------
    @_log_errors
    def cal_num_of_nodes(self):
        """Write the number of nodes to ``number_of_nodes.txt``."""
        self._write_value("number_of_nodes.txt",
                          self.my_graph.get_number_of_nodes())
        print("[INFO]: cal_num_of_nodes is finished!")

    @_log_errors
    def cal_num_of_edges(self):
        """Write the number of edges to ``number_of_edges.txt``."""
        self._write_value("number_of_edges.txt",
                          self.my_graph.get_number_of_edges())
        print("[INFO]: cal_num_of_edges is finished!")

    @_log_errors
    def cal_degree_distribution(self):
        """Write the degree histogram, one count per line, to ``degree_distribution.txt``."""
        histogram = self.my_graph.get_degree_distribution()
        self._write_lines("degree_distribution.txt",
                          [str(item) + '\n' for item in histogram])
        print("[INFO]: cal_degree_distribution is finished!")

    @_log_errors
    def cal_density(self):
        """Write the graph density to ``graph_density.txt``."""
        self._write_value("graph_density.txt", self.my_graph.get_density())
        print("[INFO]: cal_density is finished!")

    @_log_errors
    def cal_transitivity(self):
        """Write the transitivity to ``transitivity.txt``."""
        self._write_value("transitivity.txt",
                          self.my_graph.get_transitivity())
        print("[INFO]: cal_transitivity is finished!")

    @_log_errors
    def cal_average_clustering(self):
        """Write the average clustering coefficient to ``average_clustering.txt``."""
        self._write_value("average_clustering.txt",
                          self.my_graph.get_averate_clustering())
        print("[INFO]: cal_average_clustering is finished!")

    @_log_errors
    def cal_average_shortest_path_length(self):
        """Write the average shortest path length to ``average_shortest_path_length.txt``."""
        self._write_value("average_shortest_path_length.txt",
                          self.my_graph.get_average_shortest_path_length())
        print("[INFO]: cal_average_shortest_path_length is finished!")

    @_log_errors
    def write_to_pajek_net(self):
        """Write the graph by hand as a Pajek file ``graph_of_author_relation.net``.

        Vertices are written as ``<id> "<name>"`` and edges as ``<u> <v>``.
        """
        nodes_num = self.my_graph.get_number_of_nodes()
        edges_num = self.my_graph.get_number_of_edges()

        lines = ["*Vertices " + str(nodes_num) + '\n']
        for node in self.my_graph.get_nodes():
            # Nodes added directly through add_edge have no recorded name;
            # fall back to the node id itself for those.
            name = self.map_number_to_name.get(node, node)
            lines.append(str(node) + ' ' + "\"" + str(name) + "\"" + '\n')

        lines.append("*Edges " + str(edges_num) + '\n')
        for edge in self.my_graph.get_edges():
            lines.append(str(edge[0]) + ' ' + str(edge[1]) + '\n')

        self._write_lines("graph_of_author_relation.net", lines)
        print("[INFO]: write_to_pajek_net is finished!")

    @_log_errors
    def write_to_pajek_net1(self):
        """Write ``graph_of_author_relation.net`` using networkx's Pajek writer."""
        pajek_net_path = self.output_path + "graph_of_author_relation.net"
        self.my_graph.write_to_pajek(pajek_net_path)
        print("[INFO]: write_to_pajek_net1 is finished!")

    # ------------------ centrality ------------------
    @_log_errors
    def get_degree_centrality(self):
        """Return the degree centrality of the wrapped graph."""
        result = self.my_graph.get_degree_centrality()
        # Log before returning; a print placed after return never runs.
        print("[INFO]: get_degree_centrality is finished!")
        return result

    @_log_errors
    def get_betweenness_centrality(self):
        """Return the betweenness centrality of the wrapped graph."""
        result = self.my_graph.get_betweenness_centrality()
        print("[INFO]: get_betweenness_centrality is finished!")
        return result

    @_log_errors
    def get_load_centrality(self):
        """Return the load centrality of the wrapped graph."""
        result = self.my_graph.get_load_centrality()
        print("[INFO]: get_load_centrality is finished!")
        return result

    @_log_errors
    def get_eigenvector_centrality(self):
        """Return the eigenvector centrality of the wrapped graph."""
        result = self.my_graph.get_eigenvector_centrality()
        print("[INFO]: get_eigenvector_centrality is finished!")
        return result

    # ------------------ component ------------------
    @_log_errors
    def draw_max_connected_component_subgraph(self):
        """Draw the stored largest-component subgraph without labels."""
        nx.draw_networkx(self.get_max_connected_component_subgraph(),
                         with_labels=False)
        title = "Max connected subgraph of Collaboration Network"
        plt.title(title)
        plt.show()
        print("[INFO]: draw_max_connected_component_subgraph is finished!")

    @_log_errors
    def get_average_shortest_path_length_in_max_connected_component_subgraph(self):
        """Return the average shortest path length of the stored subgraph."""
        res = nx.average_shortest_path_length(
            self.get_max_connected_component_subgraph())
        print("[INFO]: get_average_shortest_path_length_in_max_connected_"
              "component_subgraph is finished!")
        return res

    @_log_errors
    def cal_average_shortest_path_length_in_max_connected_component_subgraph(self):
        """Write the subgraph's average shortest path length to a text file."""
        aver_shortest_path = (
            self.get_average_shortest_path_length_in_max_connected_component_subgraph())
        self._write_value(
            "average_shortest_path_length_in_max_connected_subgraph.txt",
            aver_shortest_path)
        print("[INFO]: cal_average_shortest_path_length_in_max_connected_"
              "component_subgraph is finished!")
# ----------------------------------------------------------------------------

下面这一部分代码就不针对networkx了,主要是xml的封装类,以及测试部分的代码
- XmlParser

#-*- coding:utf-8import xml.etree.ElementTree as etimport traceback'''基于XML的数据提取以及分析其实我只可以负责数据提取但是毕竟是同一个XML,所以把数据分析写进来我认为也是合理的'''class XmlParser:    def __init__(self, xml_path, stop_words_path):        self.stop_words_path = stop_words_path        tree = et.parse(xml_path)        self.root = tree.getroot()    # 1-pubmed 获取文章作者    def get_article_author(self):        try:            res_list = []            for pubmed_article in self.root:                try:                    #print "---------------------------------------------------"                    medline_citation = pubmed_article.findall("MedlineCitation")[0]                    article = medline_citation.findall("Article")[0]                    author_list = article.findall("AuthorList")[0]                    author_list = author_list.findall("Author")                    current_authour_list = []                    for author in author_list:                        try:                            last_name = author.findall("LastName")[0]                            initials = author.findall("Initials")[0]                            name = str(last_name.text) + ' ' + str(initials.text)                            current_authour_list.append(name)                            #print name                        except:                            continue                    res_list.append(current_authour_list)                except:                    continue            return res_list        except Exception, e:            print traceback.print_exc()    # 1-1 PMC 获取文章作者    def get_article_author1(self):        try:            res_list = []            for article in self.root:                try:                    author_list = []                    #print pubmed_article                    #print "---------------------------------------------------"                    front = article.findall("front")[0]                    article_meta = front.findall("article-meta")[0]                    contrib_group = 
article_meta.findall("contrib-group")[0]                    contrib_list = contrib_group.findall("contrib")                    for contrib in contrib_list:                        name = contrib.findall("name")[0]                        surname = name.findall("surname")[0]                        given_name = name.findall("given-names")[0]                        final_name = ""                        final_name += str(given_name.text) + " " + str(surname.text)                        author_list.append(final_name)                        #print final_name                    res_list.append(author_list)                except:                    continue            return res_list        except Exception, e:            print traceback.print_exc()    # 2_获得文章标题    def get_article_title(self, root):        try:            article_title_list = []            for pubmed_article in root:                try:                    medline_citation = pubmed_article.findall("MedlineCitation")[0]                    article = medline_citation.findall("Article")[0]                    article_title = article.findall("ArticleTitle")[0]                    article_title = str(article_title.text)                    #print article_title                    article_title_list.append(article_title)                except:                    continue            return article_title_list        except Exception,e:            print traceback.print_exc()    # 3_获取年份    def get_article_year(self, root):        try:            article_year_list = []            cnt = 0            for pubmed_article in root:                try:                    medline_citation = pubmed_article.findall("MedlineCitation")[0]                    article = medline_citation.findall("Article")[0]                    article_journal = article.findall("Journal")[0]                    article_journal_issue = article_journal.findall("JournalIssue")[0]                    pub_date = article_journal_issue.findall("PubDate")[0]       
             year = pub_date.findall("Year")[0]                    year = str(year.text)                    article_year_list.append(year)                except:                    continue            return article_year_list        except Exception, e:            print traceback.print_exc()    # 4_获取出版社名称    def get_article_journal_title(self, root):        try:            journal_title_list = []            for pubmed_article in root:                try:                    medline_citation = pubmed_article.findall("MedlineCitation")[0]                    article = medline_citation.findall("Article")[0]                    article_journal = article.findall("Journal")[0]                    article_journal_title = article_journal.findall("Title")[0]                    journal_title = str(article_journal_title.text)                    journal_title_list.append(journal_title)                except:                    continue            return journal_title_list        except Exception, e:            print traceback.print_exc()    # 5_pubmed获取文章摘要    def get_article_abstract(self, root):        try:            article_abstract_list = []            cnt = 0            for pubmed_article in root:                try:                    medline_citation = pubmed_article.findall("MedlineCitation")[0]                    article = medline_citation.findall("Article")[0]                    article_abstract = article.findall("Abstract")[0]                    article_abstract_text = article_abstract.findall("AbstractText")[0]                    # 考虑有些文章不存在摘要的情形                    if article_abstract_text is not None :                        cnt += 1                        abstract = str(article_abstract_text.text)                        #print cnt, " ", abstract                        article_abstract_list.append(abstract)                except:                    continue            return article_abstract_list        except Exception, e:            print traceback.print_exc()    
# 5-1_pmc_获取文章作者    def get_article_abstract1(self):        try:            res_list = []            for article in self.root:                try:                    author_list = []                    # print pubmed_article                    # print "---------------------------------------------------"                    front = article.findall("front")[0]                    article_meta = front.findall("article-meta")[0]                    abstract = article_meta.findall("abstract")[0]                    abstract_p = abstract.findall("p")[0]                    res_list.append(abstract_p.text)                except:                    continue            return res_list        except Exception, e:            print traceback.print_exc()    # 6_获取出版社名称 - (名字,位置)    def get_article_journal_info(self, root):        try:            # journal_country_list = []            # journal_name_list = []            journal_info_list = []            for pubmed_article in root:                try:                    medline_citation = pubmed_article.findall("MedlineCitation")[0]                    journal_info = medline_citation.findall("MedlineJournalInfo")[0]                    journal_country = str(journal_info.findall("Country")[0].text)                    journal_name = str(journal_info.findall("MedlineTA")[0].text)                    journal_info_list.append(journal_name + ',' + journal_country)                except:                    continue            return journal_info_list        except Exception, e:            print traceback.print_exc()#---------------------------------------------------------##                     计算统计特征                          -##----------------------------------------------------------#    # 7_计算每年所发文章数    def cal_num_of_article_in_each_year(self, write_path):        try:            year_list = self.get_article_year(self.root)            counter = dict()            #total = len(year_list)            #print "TOTAL articles: ", total            
for y in year_list:                if y in counter :                    counter[y] += 1                else:                    counter[y] = 1            pairs = list(counter.items())            pairs.sort(reverse=True)            outfile = open(write_path, "w")            for pair in pairs:                line = str(pair[0]) + "\t" + str(pair[1])                outfile.write(line +'\n')            outfile.close()        except Exception, e:            print traceback.print_exc()    # 8_pubmed计算文章标题中词频    def cal_word_occurence_in_article_title(self,output_path):        try:            article_list = self.get_article_title(self.root)            stop_words_list = self.get_stop_words(self.stop_words_path)            stop_words_list.append(' ')            stop_words_list.append('')  # 这个要占很大的地方            word_counter = dict()            for article in article_list:                try:                    # 预处理                    line = ""                    for ch in article:                        if ch.isalpha():                            line += ch                        else:                            line += ' '                    article = line                    article = article.split(' ')                    for word in article:                        word = word.lower()                        if word in stop_words_list:                            continue                        if word in word_counter:                            word_counter[word] += 1                        else:                            word_counter[word] = 1                except:                    continue            pairs = list(word_counter.items())            items = [(count,word) for (word,count) in pairs]            items.sort(reverse=True)            write_path = output_path + "word_occurence_in_article_title.txt"            outfile = open(write_path,"w")            final_str = ""            final_freq = ""            cnt = 0            for item in items:                line =  
str(item[1]) + "\t" + str(item[0])                outfile.write(line +'\n')                if cnt < 10:                    if cnt == 0:                        final_str = "'" + item[1] + "'" + final_str                        final_freq = "'" + str(item[0]) + "'" + final_freq                    else:                        final_str = "'" + item[1] + "'" + ',' + final_str                        final_freq = "'" + str(item[0]) + "'" + ',' + final_freq                cnt += 1            final_str = '[' + final_str + ']'            final_freq = '[' + final_freq + ']'            outfile.write(final_str + '\n')            outfile.write(final_freq + '\n')            outfile.close()        except Exception, e:            print traceback.print_exc()    # 9_pubmed计算文章摘要中词频    def cal_word_occurence_in_article_abstract(self, output_path):        try:            abstract_list = self.get_article_abstract(self.root)            stop_words_list = self.get_stop_words(self.stop_words_path)            stop_words_list.append(' ')            stop_words_list.append('')  # 这个要占很大的地方            word_counter = dict()            for abstract in abstract_list:                try:                    # 预处理                    line = ""                    for ch in abstract:                        if ch.isalpha():                            line += ch                        else:                            line += ' '                    abstract = line                    abstract = abstract.split(' ')                    for word in abstract:                        word = word.lower()                        if word in stop_words_list:                            continue                        if word in word_counter:                            word_counter[word] += 1                        else:                            word_counter[word] = 1                except:                    continue            pairs = list(word_counter.items())            items = [(count, word) for (word, count) in 
pairs]            items.sort(reverse=True)            write_path = output_path + "word_occurence_in_article_abstract.txt"            outfile = open(write_path, "w")            final_str = ""            final_freq = ""            cnt = 0            for item in items:                line = str(item[1]) + "\t" + str(item[0])                outfile.write(line + '\n')                if cnt < 10:                    if cnt == 0:                        final_str = "'" + item[1] + "'" + final_str                        final_freq = "'" + str(item[0]) + "'"+ final_freq                    else:                        final_str = "'"+item[1]+"'" + ',' + final_str                        final_freq = "'" + str(item[0]) + "'" + ',' + final_freq                cnt += 1            final_str = '[' + final_str + ']'            final_freq = '[' + final_freq + ']'            outfile.write(final_str + '\n')            outfile.write(final_freq + '\n')            outfile.close()        except Exception, e:            print traceback.print_exc()    # 9_1_pmc计算文章摘要中词频    def cal_word_occurence_in_article_abstract1(self, write_path):        try:            abstract_list = self.get_article_abstract1()            stop_words_list = self.get_stop_words(self.stop_words_path)            stop_words_list.append(' ')            stop_words_list.append('') # 这个要占很大的地方            word_counter = dict()            for abstract in abstract_list:                try:                    # 预处理                    line = ""                    for ch in abstract:                        if ch.isalpha():                            line += ch                        else:                            line += ' '                    abstract = line                    abstract = abstract.split(' ')                    for word in abstract:                        word = word.lower()                        if word in stop_words_list:                            continue                        if word in word_counter:          
                  word_counter[word] += 1                        else:                            word_counter[word] = 1                except:                    continue            pairs = list(word_counter.items())            items = [(count, word) for (word, count) in pairs]            items.sort(reverse=True)            #for item in items:            #    print item[0], '\t', item[1]            outfile = open(write_path, "w")            for item in items:                try:                    line = ""                    line = str(item[1]) + '\t' + str(item[0])                    outfile.write(line+'\n')                except Exception as ex:                    print ex            outfile.close()        except Exception, e:            print traceback.print_exc()    # 10_计算期刊的名字以及其地理位置的出现次数    def cal_journal_name_and_country_ouucrence(self, country_path, name_path):        try:            name_counter = dict()            country_counter = dict()            journal_info_list = self.get_article_journal_info(self.root)            for item in journal_info_list:                item = item.split(',')                journal_name = item[0]                journal_country = item[1]                if journal_name in name_counter:                    name_counter[journal_name] += 1                else:                    name_counter[journal_name] = 1                if journal_country in country_counter:                    country_counter[journal_country] += 1                else:                    country_counter[journal_country] = 1            pairs = list(name_counter.items())            reverse_pairs = [ (count,name) for (name,count) in pairs ]            reverse_pairs.sort(reverse=True)            outfile = open(name_path, "w")            for item in reverse_pairs:                name = str(item[1])                count = str(item[0])                line = ""                line += name                line += '\t'                line += count                
outfile.write(line + '\n')            outfile.close()            pairs = list(country_counter.items())            reverse_pairs = [(count, country) for (country, count) in pairs]            reverse_pairs.sort(reverse=True)            outfile = open(country_path, "w")            for item in reverse_pairs:                name = str(item[1])                count = str(item[0])                line = ""                line += name                line += '\t'                line += count                outfile.write(line + '\n')            outfile.close()        except Exception, e:            print traceback.print_exc()    # 11_计算发布量前10的论文,在不同区的数量    def cal_num_in_diff_area(self, input_path, out_path):        try:            area_counter = {}            cnt = 0            infile = open(input_path, "r")            for line in infile:                cnt += 1                if cnt == 1:                    continue                line = line.rstrip('\n').split(' ')                num = int(line[1])                area = line[3]                if area in area_counter:                    area_counter[area] += num                else:                    area_counter[area] = num            infile.close()            outfile = open(out_path, "w")            for area in area_counter:                line = ""                line += str(area)                line += " "                line += str(area_counter[area])                outfile.write(line + '\n')            outfile.close()        except Exception, e:            print traceback.print_exc()    # 12_计算影响因子    def cal_aver_if_factor(self, input_path):        try:            cnt = 0            infile = open(input_path, "r")            total_num = 0            total_factor = 0.0            for line in infile:                cnt += 1                if cnt == 1:                    continue                line = line.rstrip('\n').split(' ')                num = int(line[1])                factor = float(line[2])                
total_num += num                total_factor += factor * num            infile.close()            print total_factor / total_num        except Exception, e:            print traceback.print_exc()    # 13_获取停用词    def get_stop_words(self, stop_words_path):        result_list = []        infile = open(stop_words_path, "r")        for line in infile:            line = line.rstrip('\n')            result_list.append(line)        infile.close()        return result_list    # 14_测试函数    def test(self):        journal_info_list = self.get_article_journal_info(self.root)        print len(journal_info_list)        for aa in journal_info_list:            print aa
  • main.py
#-*- coding:utf-8 -*-
"""Driver script: statistics and co-authorship network analysis.

Builds graphs with ``MyGraph`` from either parsed XML (``XmlParser``) or a
plain edge-list file, and writes the computed statistics below the
configured output directory.
"""
import traceback  # used by every handler below; do not rely on star-imports

from XmlParser import *
from MyGraph import *

STOP_WORDS_PATH = "../file/stop_words.txt"
XML_PATH1 = "../data/PUBMED/LANCET/2006/lancet_2006_1570.xml"
#XML_PATH2 = "../data/PUBMED/LANCET/2009/lancet_2009_1516.xml"
#OUTPUT_PATH1 = "../output/network_analysis/PUBMED/LANCET/2006/"
#OUTPUT_PATH2 = "../output/network_analysis/PUBMED/LANCET/2009/"
OUTPUT_PATH3 = "../output/src_output/edge.txt"
INPUT_PATH = "../data/src_input/citation.csv"
OUTPUT_PATH = "../output/src_output/"


def statical_analysis(xml_parser_obj, OUTPUT_PATH):
    """Write abstract and title word-frequency files.

    Args:
        xml_parser_obj: parser object exposing
            ``cal_word_occurence_in_article_abstract`` and
            ``cal_word_occurence_in_article_title``.
        OUTPUT_PATH: output directory prefix passed to both calls.
    """
    try:
        xml_parser_obj.cal_word_occurence_in_article_abstract(OUTPUT_PATH)
        xml_parser_obj.cal_word_occurence_in_article_title(OUTPUT_PATH)
        print("[INFO]: statical_analysis is finished!")
    except Exception:
        traceback.print_exc()


def author_collaboration_network_analysis(xml_parser_obj, OUTPUT_PATH):
    """Build the co-authorship graph of one data set and compute statistics.

    Args:
        xml_parser_obj: parser object whose ``get_article_author()`` returns
            the per-article author lists (cliques).
        OUTPUT_PATH: output directory handed to ``MyGraph.set_output_path``.
    """
    try:
        # get the author clique list
        author_clique_list = xml_parser_obj.get_article_author()

        # construct the graph based on the author clique list
        graph = MyGraph()
        graph.construct_graph(author_clique_list)
        graph.set_output_path(OUTPUT_PATH)

        # calculate the statistics
        graph.cal_num_of_nodes()
        graph.cal_num_of_edges()
        graph.cal_degree_distribution()
        graph.cal_density()
        # The collaboration network is usually not connected, so the
        # whole-graph average shortest path length is not computed here:
        #graph.cal_average_shortest_path_length()
        graph.cal_average_clustering()
        graph.write_to_pajek_net1()

        # This does not really draw communities: it draws the whole graph
        # with the different cliques marked.
        graph.draw_community()

        graph.set_max_connected_component_subgraph()
        graph.draw_max_connected_component_subgraph()
        graph.cal_average_shortest_path_length_in_max_connected_component_subgraph()
        #graph.draw_graph()
        #graph.draw_graph_spring_layout()
        #graph.draw_graph_random()
        print("[INFO]: author_collaboration_network_analysis is finished!")
    except Exception:
        traceback.print_exc()


def author_collaboration_network_analysis1(xml_parser_obj1, xml_parser_obj2, OUTPUT_PATH):
    """Same analysis as above, on the merged author cliques of two data sets.

    Args:
        xml_parser_obj1: first parser object (``get_article_author()``).
        xml_parser_obj2: second parser object (``get_article_author()``).
        OUTPUT_PATH: output directory handed to ``MyGraph.set_output_path``.
    """
    try:
        # get the author clique list
        author_clique_list = xml_parser_obj1.get_article_author()
        author_clique_list.extend(xml_parser_obj2.get_article_author())

        # construct the graph based on the author clique list
        graph = MyGraph()
        graph.construct_graph(author_clique_list)
        graph.set_output_path(OUTPUT_PATH)

        # calculate the statistics
        graph.cal_num_of_nodes()
        graph.cal_num_of_edges()
        graph.cal_degree_distribution()
        graph.cal_density()
        graph.cal_average_shortest_path_length()
        graph.cal_average_clustering()
        graph.write_to_pajek_net1()
        graph.draw_community()
        #graph.draw_graph()
        #graph.draw_graph_spring_layout()
        #graph.draw_graph_random()
        print("[INFO]: author_collaboration_network_analysis is finished!")
    except Exception:
        traceback.print_exc()


def test_for_srx():
    """Build a graph from the comma-separated edge list at ``INPUT_PATH``.

    The first two fields of every line are used as the edge endpoints;
    lines with fewer than two fields are skipped.
    """
    try:
        graph = MyGraph()
        graph.set_output_path(OUTPUT_PATH)
        with open(INPUT_PATH, "r") as infile:
            for line in infile:
                # Strip the line terminator first; otherwise the second
                # field keeps its trailing newline and becomes a distinct
                # node name.
                fields = line.rstrip('\r\n').split(',')
                if len(fields) < 2:
                    continue
                graph.add_edge(fields[0], fields[1])
        print("[INFO]: graph is finished!")
        graph.cal_average_clustering()
        graph.cal_average_shortest_path_length_in_max_connected_component_subgraph()
        graph.cal_degree_distribution()
        graph.cal_density()
        graph.cal_transitivity()
    except Exception:
        traceback.print_exc()


def test_for_jcx(max_edges=10000):
    """Draw a graph built from the whitespace-separated edge list at ``INPUT_PATH``.

    Args:
        max_edges: stop reading after this many edges have been added
            (default 10000, the previously hard-coded limit).
    """
    try:
        graph = MyGraph()
        graph.set_output_path(OUTPUT_PATH)
        cnt = 0
        with open(INPUT_PATH, "r") as infile:
            for line in infile:
                fields = line.split()
                if len(fields) < 2:
                    continue
                graph.add_edge(fields[0], fields[1])
                cnt += 1
                if cnt == max_edges:
                    break
        print("[INFO]: graph is finished!")
        # Statistics intentionally disabled for this run:
        #graph.cal_average_clustering()
        #graph.cal_average_shortest_path_length_in_max_connected_component_subgraph()
        #graph.cal_degree_distribution()
        #graph.cal_density()
        #graph.cal_transitivity()
        title = "Social Network - Live Journal"
        graph.draw_graph(title)
    except Exception:
        traceback.print_exc()


def main():
    """Entry point: currently runs only ``test_for_srx``."""
    try:
        print("[INFO]: Programme is running......")
        # parse the xml and get the result
        #a_obj1 = XmlParser(XML_PATH1, STOP_WORDS_PATH)
        #a_obj2 = XmlParser(XML_PATH2, STOP_WORDS_PATH)
        #statical_analysis(a_obj1, OUTPUT_PATH1)
        #statical_analysis(a_obj2, OUTPUT_PATH2)
        #author_collaboration_network_analysis(a_obj1, OUTPUT_PATH1)
        test_for_srx()
        print("[INFO]: Programme terminated successfully!")
    except Exception:
        traceback.print_exc()


if __name__ == "__main__":
    main()
0 0
原创粉丝点击