python网络分析-networkx第一版的封装
来源:互联网 发布:mac终端c语言 编辑:程序博客网 时间:2024/06/05 16:52
本文是对我做实验时用到的networkx所做的一个初步封装。这一版封装得还不够规范,目前正在写第二版,这里先把之前的代码贴上来。主要参考的文档是networkx的官方文档。
[networkx-reference]
我需要说明一点,下面的代码针对的是无向图。
代码
下面这一部分代码是对networkx的初步封装。
- GraphOperation.py
#-*- coding:utf-8 -*-
"""Thin convenience wrapper around a single networkx graph.

The wrapper was written with undirected graphs in mind.  Every wrapped
operation is best-effort: when the underlying networkx call raises, the
traceback is printed and the method returns None instead of raising.
"""
import functools
import traceback

import matplotlib.pyplot as plt
import networkx as nx


def _log_errors(method):
    """Wrap *method* so that any exception is printed and None is returned.

    This keeps the module's original contract (callers never see an
    exception) without repeating a try/except block in every method.
    """
    @functools.wraps(method)
    def wrapper(*args, **kwargs):
        try:
            return method(*args, **kwargs)
        except Exception:
            # print_exc() already writes the traceback; printing its return
            # value as the old code did only added a stray "None" line.
            traceback.print_exc()
            return None
    return wrapper


class GraphOperation(object):
    """Tool class exposing node, edge, degree, drawing and metric helpers."""

    # ----------------- graph operation -----------------

    def __init__(self):
        """Start with an empty undirected graph."""
        self.graph = nx.Graph()

    def convert_to_directed_graph(self):
        """Replace the graph with a new, empty directed graph.

        NOTE: existing nodes and edges are not carried over.
        """
        self.graph = nx.DiGraph()

    def convert_to_multi_graph(self):
        """Replace the graph with a new, empty multigraph.

        NOTE: existing nodes and edges are not carried over.
        """
        self.graph = nx.MultiGraph()

    def convert_to_undirected_graph(self):
        """Replace the graph with a new, empty undirected graph.

        NOTE: existing nodes and edges are not carried over.
        """
        self.graph = nx.Graph()

    @_log_errors
    def clear_graph(self):
        """Remove all nodes and edges from the graph."""
        self.graph.clear()

    # ----------------- node operation -----------------

    @_log_errors
    def add_node(self, node):
        """Add a single node."""
        self.graph.add_node(node)

    @_log_errors
    def add_nodes_by_list(self, node_list):
        """Add every node in *node_list*."""
        self.graph.add_nodes_from(node_list)

    @_log_errors
    def remove_node(self, node):
        """Remove a single node."""
        self.graph.remove_node(node)

    @_log_errors
    def remove_nodes_by_list(self, node_list):
        """Remove every node in *node_list*."""
        self.graph.remove_nodes_from(node_list)

    @_log_errors
    def get_number_of_nodes(self):
        """Return the number of nodes."""
        return self.graph.number_of_nodes()

    @_log_errors
    def get_nodes(self):
        """Return the graph's nodes as given by ``Graph.nodes()``."""
        return self.graph.nodes()

    @_log_errors
    def get_neighbors(self, v):
        """Return the neighbors of node *v* as given by ``Graph.neighbors``."""
        return self.graph.neighbors(v)

    # ----------------- edge operation -----------------

    @_log_errors
    def add_edge(self, u, v):
        """Add the edge (u, v)."""
        self.graph.add_edge(u, v)

    @_log_errors
    def add_edge_by_tuple(self, e):
        """Add the edge described by the tuple *e* = (u, v)."""
        self.add_edge(*e)

    @_log_errors
    def add_edges_by_list(self, edge_list):
        """Add every edge in *edge_list*, a list of (u, v) tuples."""
        self.graph.add_edges_from(edge_list)

    @_log_errors
    def remove_edge(self, u, v):
        """Remove the edge (u, v)."""
        self.graph.remove_edge(u, v)

    @_log_errors
    def remove_edge_by_tuple(self, e):
        """Remove the edge described by the tuple *e* = (u, v)."""
        self.remove_edge(*e)

    @_log_errors
    def remove_edges_by_list(self, edge_list):
        """Remove every edge in *edge_list*, a list of (u, v) tuples."""
        # The method lives on the networkx graph, not on this wrapper; the
        # previous self.remove_edges_from(...) always failed.
        self.graph.remove_edges_from(edge_list)

    @_log_errors
    def get_number_of_edges(self):
        """Return the number of edges."""
        return self.graph.number_of_edges()

    @_log_errors
    def get_edges(self):
        """Return the graph's edges as given by ``Graph.edges()``."""
        return self.graph.edges()

    @_log_errors
    def add_weighted_edge(self, weighted_edge_list):
        """Add weighted edges from a list of (u, v, weight) tuples."""
        self.graph.add_weighted_edges_from(weighted_edge_list)

    @_log_errors
    def get_weighted_edge(self):
        """Return the edges together with their 'weight' attribute."""
        return self.graph.edges(data='weight')

    # ----------------- degree analysis -----------------

    @_log_errors
    def get_degree(self):
        """Return the degree of all nodes as given by ``Graph.degree()``.

        Per the original author's note, this was exercised on directed
        graphs only.
        """
        return self.graph.degree()

    @_log_errors
    def get_degree_by_node(self, node_id):
        """Return the degree of one node."""
        return self.graph.degree(node_id)

    @_log_errors
    def get_degree_based_on_weight_by_node(self, node_id):
        """Return the weighted degree of one node.

        The degree is the sum of the incident edge weights rather than the
        edge count, e.g. edges (1, 2, 0.5) and (3, 1, 0.75) give node 1 a
        weighted degree of 1.25, not 2.
        """
        return self.graph.degree(node_id, weight="weight")

    @_log_errors
    def get_sorted_degrees(self):
        """Return all degree values as a list sorted in descending order."""
        # dict(...) accepts both a plain dict and an iterable of
        # (node, degree) pairs, whichever Graph.degree() returns.
        return sorted(dict(self.graph.degree()).values(), reverse=True)

    @_log_errors
    def get_in_degree(self):
        """Return the in-degree of all nodes (directed graphs only)."""
        return self.graph.in_degree()

    @_log_errors
    def get_in_degree_by_node(self, node_id):
        """Return the in-degree of one node (directed graphs only)."""
        return self.graph.in_degree(node_id)

    @_log_errors
    def get_in_degree_based_on_weight_by_node(self, node_id):
        """Return the weighted in-degree of one node (directed graphs only)."""
        return self.graph.in_degree(node_id, weight="weight")

    @_log_errors
    def get_out_degree(self):
        """Return the out-degree of all nodes (directed graphs only)."""
        return self.graph.out_degree()

    @_log_errors
    def get_out_degree_by_node(self, node_id):
        """Return the out-degree of one node (directed graphs only)."""
        return self.graph.out_degree(node_id)

    @_log_errors
    def get_out_degree_based_on_weight_by_node(self, node_id):
        """Return the weighted out-degree of one node (directed graphs only)."""
        return self.graph.out_degree(node_id, weight="weight")

    # ----------------- component analysis -----------------

    @_log_errors
    def get_connected_components(self):
        """Return the connected components as a list, one entry per component."""
        # Materialised so the result is a list as documented, even when
        # networkx hands back a generator.
        return list(nx.connected_components(self.graph))

    # ----------------- drawing graph -----------------

    @_log_errors
    def draw_graph(self, title):
        """Draw the graph with the default layout under *title* and show it."""
        plt.title(title)
        nx.draw(self.graph)
        # show() takes no title argument; the title is set above.
        plt.show()

    @_log_errors
    def draw_network(self):
        """Draw the graph with labels using a spring layout and show it."""
        # draw_networkx expects node positions, not the layout function.
        nx.draw_networkx(self.graph, pos=nx.spring_layout(self.graph))
        plt.show()

    @_log_errors
    def draw_graph_random_layout(self):
        """Draw the graph with a random layout and show it."""
        nx.draw_random(self.graph)
        plt.show()

    @_log_errors
    def draw_graph_spring_layout(self):
        """Draw the graph with a spring layout and show it."""
        nx.draw_spring(self.graph)
        plt.show()

    # ----------------- graph metrics -----------------

    @_log_errors
    def get_degree_distribution(self):
        """Return the degree histogram.

        Unlike the degree getters, which report one value per node, entry
        ``i`` of the returned list is the number of nodes with degree ``i``.
        """
        return nx.degree_histogram(self.graph)

    @_log_errors
    def get_density(self):
        """Return the graph density."""
        return nx.density(self.graph)

    @_log_errors
    def get_transitivity(self):
        """Return the transitivity (global clustering coefficient)."""
        return nx.transitivity(self.graph)

    @_log_errors
    def get_averate_clustering(self):
        """Return the average clustering coefficient.

        The misspelled name is kept because existing callers use it; see
        ``get_average_clustering`` for the correctly spelled alias.
        """
        return nx.average_clustering(self.graph)

    # Correctly spelled alias for the method above.
    get_average_clustering = get_averate_clustering

    @_log_errors
    def get_average_shortest_path_length(self):
        """Return the average shortest path length of the graph."""
        return nx.average_shortest_path_length(self.graph)

    @_log_errors
    def write_to_pajek(self, pajek_net_path):
        """Write the graph to *pajek_net_path* in Pajek format."""
        nx.write_pajek(self.graph, pajek_net_path)

    # ----------------- centrality -----------------

    @_log_errors
    def get_degree_centrality(self):
        """Return degree centrality: the fraction of nodes each node touches."""
        return nx.degree_centrality(self.graph)

    @_log_errors
    def get_betweenness_centrality(self):
        """Return betweenness centrality.

        For a node v this is the sum of the fraction of all-pairs shortest
        paths that pass through v.
        """
        return nx.betweenness_centrality(self.graph)

    @_log_errors
    def get_load_centrality(self):
        """Return load centrality: the fraction of shortest paths through a node."""
        return nx.load_centrality(self.graph)

    @_log_errors
    def get_eigenvector_centrality(self):
        """Return eigenvector centrality, based on the centrality of neighbors."""
        return nx.eigenvector_centrality(self.graph)
- MyGraph.py
#-*- coding:utf-8 -*-
"""Graph operations built on top of the GraphOperation tool class.

MyGraph turns lists of named cliques (for example the authors of each
article) into a numbered graph, draws it, and writes network statistics to
files under a configurable output path.  Like GraphOperation, every method
is best-effort: failures are printed as a traceback and None is returned.
"""
import functools
import traceback

import matplotlib.pyplot as plt
import networkx as nx

# The star import is kept so that modules doing ``from MyGraph import *``
# still receive every name they received before.
from GraphOperation import *
from GraphOperation import GraphOperation


def _log_errors(method):
    """Wrap *method* so that any exception is printed and None is returned."""
    @functools.wraps(method)
    def wrapper(*args, **kwargs):
        try:
            return method(*args, **kwargs)
        except Exception:
            # print_exc() already writes the traceback; printing its return
            # value as the old code did only added a stray "None" line.
            traceback.print_exc()
            return None
    return wrapper


class MyGraph(object):
    """A graph of named items with drawing and analysis helpers."""

    def __init__(self):
        """Create an empty graph and the bookkeeping attributes."""
        self.my_graph = GraphOperation()
        # name -> node number, and the inverse mapping
        self.map_name_to_number = dict()
        self.map_number_to_name = dict()
        # prefix prepended to every output file name
        self.output_path = ""
        # numbered cliques of the latest construct_graph call; used by
        # draw_community
        self.clique_list = []
        self.max_connected_component_subgraph = None

    @_log_errors
    def construct_graph(self, clique_list):
        """Build the graph from *clique_list*, a list of lists of names.

        Each distinct name gets an integer id; every clique becomes a set
        of fully connected nodes.  A one-element clique still adds its node.
        """
        name_to_number = self.map_name_to_number
        # Continue after the largest id already handed out so that a second
        # call cannot reuse the numbers of earlier names.
        if name_to_number:
            next_number = max(name_to_number.values()) + 1
        else:
            next_number = 1

        numbered_cliques = []
        for clique in clique_list:
            numbered = []
            for name in clique:
                if name not in name_to_number:
                    name_to_number[name] = next_number
                    next_number += 1
                numbered.append(name_to_number[name])
            numbered_cliques.append(numbered)

        self.map_number_to_name = dict(
            (number, name) for name, number in name_to_number.items())
        self.clique_list = numbered_cliques

        for clique in numbered_cliques:
            for u in clique:
                # Add the node explicitly so isolated nodes are not lost.
                self.my_graph.add_node(u)
                for v in clique:
                    if u != v:
                        self.my_graph.add_edge_by_tuple((u, v))
        print("[INFO]: construct_graph is finished!")

    @_log_errors
    def add_edge(self, u, v):
        """Add the edge (u, v) to the graph."""
        self.my_graph.add_edge(u, v)

    @_log_errors
    def get_all_edges(self):
        """Return all edges of the graph."""
        return self.my_graph.get_edges()

    @_log_errors
    def set_output_path(self, output_path):
        """Set the prefix under which the network statistics are written."""
        self.output_path = output_path
        print("[INFO]: set_output_path is finished!")

    @_log_errors
    def set_max_connected_component_subgraph(self):
        """Compute and store the largest connected component as a subgraph.

        Must be called after the graph has been fully built.
        """
        graph = self.my_graph.graph
        # connected_component_subgraphs no longer exists in newer networkx;
        # taking the largest node set and copying its subgraph is the
        # replacement that networkx documents.
        largest = max(nx.connected_components(graph), key=len)
        self.max_connected_component_subgraph = graph.subgraph(largest).copy()
        print("[INFO]: set_max_connected_component_subgraph is finished!")

    @_log_errors
    def get_max_connected_component_subgraph(self):
        """Return the stored largest component (a raw networkx graph)."""
        return self.max_connected_component_subgraph

    # ----------------- draw the network -----------------

    @_log_errors
    def draw_community(self):
        """Draw the whole graph, colouring each stored clique differently.

        Cliques are drawn one after another on a shared spring layout; the
        colours repeat once there are more cliques than colours.
        """
        graph = self.my_graph.graph
        pos = nx.spring_layout(graph)
        node_size_ = 100
        color_list = ["red", "yellow", "blue", "green", "pink", "orange", "purple"]
        color_count = len(color_list)

        for i, node_list in enumerate(self.clique_list):
            edge_list = self.get_edges_for_community(node_list)
            # Wrap around the palette; indexing with i alone raised
            # IndexError as soon as there were more cliques than colours.
            nx.draw_networkx_nodes(graph, pos, nodelist=node_list,
                                   node_size=node_size_,
                                   node_color=color_list[i % color_count],
                                   label="hello")
            nx.draw_networkx_edges(graph, pos, edgelist=edge_list)

        title = "people relation by train"
        plt.title(title)
        plt.show()
        print("[INFO]: draw_community is finished!")

    @_log_errors
    def get_edges_for_community(self, node_list):
        """Return every ordered pair (u, v) of distinct values in *node_list*."""
        return [(u, v) for u in node_list for v in node_list if u != v]

    @_log_errors
    def draw_graph(self, title):
        """Draw the graph with the default layout under *title*."""
        self.my_graph.draw_graph(title)
        print("[INFO]: draw_graph is finished!")

    @_log_errors
    def draw_network(self):
        """Draw the labelled graph via the wrapped GraphOperation."""
        # Delegate to the wrapped object; calling self.draw_network()
        # recursed forever.
        self.my_graph.draw_network()

    @_log_errors
    def draw_graph_random_layout(self):
        """Draw the graph with a random layout."""
        self.my_graph.draw_graph_random_layout()

    @_log_errors
    def draw_graph_spring_layout(self):
        """Draw the graph with a spring layout."""
        self.my_graph.draw_graph_spring_layout()
        print("[INFO]: draw_graph_spring_layout is finished!")

    # ----------------- network analysis -----------------

    def _write_output(self, file_name, lines):
        """Write each item of *lines* on its own line to output_path + file_name."""
        with open(self.output_path + file_name, "w") as outfile:
            for line in lines:
                outfile.write(str(line) + '\n')

    @_log_errors
    def cal_num_of_nodes(self):
        """Write the node count to number_of_nodes.txt."""
        self._write_output("number_of_nodes.txt",
                           [self.my_graph.get_number_of_nodes()])
        print("[INFO]: cal_num_of_nodes is finished!")

    @_log_errors
    def cal_num_of_edges(self):
        """Write the edge count to number_of_edges.txt."""
        self._write_output("number_of_edges.txt",
                           [self.my_graph.get_number_of_edges()])
        print("[INFO]: cal_num_of_edges is finished!")

    @_log_errors
    def cal_degree_distribution(self):
        """Write the degree histogram, one count per line, to degree_distribution.txt."""
        self._write_output("degree_distribution.txt",
                           self.my_graph.get_degree_distribution())
        print("[INFO]: cal_degree_distribution is finished!")

    @_log_errors
    def cal_density(self):
        """Write the graph density to graph_density.txt."""
        self._write_output("graph_density.txt", [self.my_graph.get_density()])
        print("[INFO]: cal_density is finished!")

    @_log_errors
    def cal_transitivity(self):
        """Write the transitivity to transitivity.txt."""
        self._write_output("transitivity.txt",
                           [self.my_graph.get_transitivity()])
        print("[INFO]: cal_transitivity is finished!")

    @_log_errors
    def cal_average_clustering(self):
        """Write the average clustering coefficient to average_clustering.txt."""
        self._write_output("average_clustering.txt",
                           [self.my_graph.get_averate_clustering()])
        print("[INFO]: cal_average_clustering is finished!")

    @_log_errors
    def cal_average_shortest_path_length(self):
        """Write the average shortest path length to its output file."""
        self._write_output("average_shortest_path_length.txt",
                           [self.my_graph.get_average_shortest_path_length()])
        print("[INFO]: cal_average_shortest_path_length is finished!")

    @_log_errors
    def write_to_pajek_net(self):
        """Write the graph by hand as a Pajek .net file.

        Vertices are written as ``<number> "<name>"`` and edges as
        ``<u> <v>``.
        """
        output_path = self.output_path + "graph_of_author_relation.net"
        nodes_num = self.my_graph.get_number_of_nodes()
        edges_num = self.my_graph.get_number_of_edges()
        with open(output_path, "w") as outfile:
            outfile.write("*Vertices " + str(nodes_num) + '\n')
            for node in self.my_graph.get_nodes():
                # Nodes added through add_edge() have no stored name; fall
                # back to the node itself instead of failing.
                name = self.map_number_to_name.get(node, node)
                outfile.write(str(node) + ' ' + "\"" + str(name) + "\"" + '\n')
            outfile.write("*Edges " + str(edges_num) + '\n')
            for edge in self.my_graph.get_edges():
                outfile.write(str(edge[0]) + ' ' + str(edge[1]) + '\n')
        print("[INFO]: write_to_pajek_net is finished!")

    @_log_errors
    def write_to_pajek_net1(self):
        """Write the graph as a Pajek .net file using networkx's writer."""
        pajek_net_path = self.output_path + "graph_of_author_relation.net"
        self.my_graph.write_to_pajek(pajek_net_path)
        print("[INFO]: write_to_pajek_net1 is finished!")

    # ----------------- centrality -----------------

    @_log_errors
    def get_degree_centrality(self):
        """Return the degree centrality of every node."""
        result = self.my_graph.get_degree_centrality()
        print("[INFO]: get_degree_centrality is finished!")
        return result

    @_log_errors
    def get_betweenness_centrality(self):
        """Return the betweenness centrality of every node."""
        result = self.my_graph.get_betweenness_centrality()
        print("[INFO]: get_betweenness_centrality is finished!")
        return result

    @_log_errors
    def get_load_centrality(self):
        """Return the load centrality of every node."""
        result = self.my_graph.get_load_centrality()
        print("[INFO]: get_load_centrality is finished!")
        return result

    @_log_errors
    def get_eigenvector_centrality(self):
        """Return the eigenvector centrality of every node."""
        result = self.my_graph.get_eigenvector_centrality()
        print("[INFO]: get_eigenvector_centrality is finished!")
        return result

    # ----------------- component -----------------

    @_log_errors
    def draw_max_connected_component_subgraph(self):
        """Draw the stored largest connected component without labels."""
        nx.draw_networkx(self.get_max_connected_component_subgraph(),
                         with_labels=False)
        title = "Max connected subgraph of Collaboration Network"
        plt.title(title)
        plt.show()
        print("[INFO]: draw_max_connected_component_subgraph is finished!")

    @_log_errors
    def get_average_shortest_path_length_in_max_connected_component_subgraph(self):
        """Return the average shortest path length of the largest component."""
        res = nx.average_shortest_path_length(
            self.get_max_connected_component_subgraph())
        print("[INFO]: get_average_shortest_path_length_in_max_connected_component_subgraph is finished!")
        return res

    @_log_errors
    def cal_average_shortest_path_length_in_max_connected_component_subgraph(self):
        """Write the largest component's average shortest path length to a file."""
        aver_shortest_path = self.get_average_shortest_path_length_in_max_connected_component_subgraph()
        self._write_output(
            "average_shortest_path_length_in_max_connected_subgraph.txt",
            [aver_shortest_path])
        print("[INFO]: cal_average_shortest_path_length_in_max_connected_component_subgraph is finished!")
下面这一部分代码就不针对networkx了,主要是xml的封装类,以及测试部分的代码
- XmlParser
#-*- coding:utf-8import xml.etree.ElementTree as etimport traceback'''基于XML的数据提取以及分析其实我只可以负责数据提取但是毕竟是同一个XML,所以把数据分析写进来我认为也是合理的'''class XmlParser: def __init__(self, xml_path, stop_words_path): self.stop_words_path = stop_words_path tree = et.parse(xml_path) self.root = tree.getroot() # 1-pubmed 获取文章作者 def get_article_author(self): try: res_list = [] for pubmed_article in self.root: try: #print "---------------------------------------------------" medline_citation = pubmed_article.findall("MedlineCitation")[0] article = medline_citation.findall("Article")[0] author_list = article.findall("AuthorList")[0] author_list = author_list.findall("Author") current_authour_list = [] for author in author_list: try: last_name = author.findall("LastName")[0] initials = author.findall("Initials")[0] name = str(last_name.text) + ' ' + str(initials.text) current_authour_list.append(name) #print name except: continue res_list.append(current_authour_list) except: continue return res_list except Exception, e: print traceback.print_exc() # 1-1 PMC 获取文章作者 def get_article_author1(self): try: res_list = [] for article in self.root: try: author_list = [] #print pubmed_article #print "---------------------------------------------------" front = article.findall("front")[0] article_meta = front.findall("article-meta")[0] contrib_group = article_meta.findall("contrib-group")[0] contrib_list = contrib_group.findall("contrib") for contrib in contrib_list: name = contrib.findall("name")[0] surname = name.findall("surname")[0] given_name = name.findall("given-names")[0] final_name = "" final_name += str(given_name.text) + " " + str(surname.text) author_list.append(final_name) #print final_name res_list.append(author_list) except: continue return res_list except Exception, e: print traceback.print_exc() # 2_获得文章标题 def get_article_title(self, root): try: article_title_list = [] for pubmed_article in root: try: medline_citation = pubmed_article.findall("MedlineCitation")[0] article = 
medline_citation.findall("Article")[0] article_title = article.findall("ArticleTitle")[0] article_title = str(article_title.text) #print article_title article_title_list.append(article_title) except: continue return article_title_list except Exception,e: print traceback.print_exc() # 3_获取年份 def get_article_year(self, root): try: article_year_list = [] cnt = 0 for pubmed_article in root: try: medline_citation = pubmed_article.findall("MedlineCitation")[0] article = medline_citation.findall("Article")[0] article_journal = article.findall("Journal")[0] article_journal_issue = article_journal.findall("JournalIssue")[0] pub_date = article_journal_issue.findall("PubDate")[0] year = pub_date.findall("Year")[0] year = str(year.text) article_year_list.append(year) except: continue return article_year_list except Exception, e: print traceback.print_exc() # 4_获取出版社名称 def get_article_journal_title(self, root): try: journal_title_list = [] for pubmed_article in root: try: medline_citation = pubmed_article.findall("MedlineCitation")[0] article = medline_citation.findall("Article")[0] article_journal = article.findall("Journal")[0] article_journal_title = article_journal.findall("Title")[0] journal_title = str(article_journal_title.text) journal_title_list.append(journal_title) except: continue return journal_title_list except Exception, e: print traceback.print_exc() # 5_pubmed获取文章摘要 def get_article_abstract(self, root): try: article_abstract_list = [] cnt = 0 for pubmed_article in root: try: medline_citation = pubmed_article.findall("MedlineCitation")[0] article = medline_citation.findall("Article")[0] article_abstract = article.findall("Abstract")[0] article_abstract_text = article_abstract.findall("AbstractText")[0] # 考虑有些文章不存在摘要的情形 if article_abstract_text is not None : cnt += 1 abstract = str(article_abstract_text.text) #print cnt, " ", abstract article_abstract_list.append(abstract) except: continue return article_abstract_list except Exception, e: print 
traceback.print_exc() # 5-1_pmc_获取文章作者 def get_article_abstract1(self): try: res_list = [] for article in self.root: try: author_list = [] # print pubmed_article # print "---------------------------------------------------" front = article.findall("front")[0] article_meta = front.findall("article-meta")[0] abstract = article_meta.findall("abstract")[0] abstract_p = abstract.findall("p")[0] res_list.append(abstract_p.text) except: continue return res_list except Exception, e: print traceback.print_exc() # 6_获取出版社名称 - (名字,位置) def get_article_journal_info(self, root): try: # journal_country_list = [] # journal_name_list = [] journal_info_list = [] for pubmed_article in root: try: medline_citation = pubmed_article.findall("MedlineCitation")[0] journal_info = medline_citation.findall("MedlineJournalInfo")[0] journal_country = str(journal_info.findall("Country")[0].text) journal_name = str(journal_info.findall("MedlineTA")[0].text) journal_info_list.append(journal_name + ',' + journal_country) except: continue return journal_info_list except Exception, e: print traceback.print_exc()#---------------------------------------------------------## 计算统计特征 -##----------------------------------------------------------# # 7_计算每年所发文章数 def cal_num_of_article_in_each_year(self, write_path): try: year_list = self.get_article_year(self.root) counter = dict() #total = len(year_list) #print "TOTAL articles: ", total for y in year_list: if y in counter : counter[y] += 1 else: counter[y] = 1 pairs = list(counter.items()) pairs.sort(reverse=True) outfile = open(write_path, "w") for pair in pairs: line = str(pair[0]) + "\t" + str(pair[1]) outfile.write(line +'\n') outfile.close() except Exception, e: print traceback.print_exc() # 8_pubmed计算文章标题中词频 def cal_word_occurence_in_article_title(self,output_path): try: article_list = self.get_article_title(self.root) stop_words_list = self.get_stop_words(self.stop_words_path) stop_words_list.append(' ') stop_words_list.append('') # 这个要占很大的地方 
word_counter = dict() for article in article_list: try: # 预处理 line = "" for ch in article: if ch.isalpha(): line += ch else: line += ' ' article = line article = article.split(' ') for word in article: word = word.lower() if word in stop_words_list: continue if word in word_counter: word_counter[word] += 1 else: word_counter[word] = 1 except: continue pairs = list(word_counter.items()) items = [(count,word) for (word,count) in pairs] items.sort(reverse=True) write_path = output_path + "word_occurence_in_article_title.txt" outfile = open(write_path,"w") final_str = "" final_freq = "" cnt = 0 for item in items: line = str(item[1]) + "\t" + str(item[0]) outfile.write(line +'\n') if cnt < 10: if cnt == 0: final_str = "'" + item[1] + "'" + final_str final_freq = "'" + str(item[0]) + "'" + final_freq else: final_str = "'" + item[1] + "'" + ',' + final_str final_freq = "'" + str(item[0]) + "'" + ',' + final_freq cnt += 1 final_str = '[' + final_str + ']' final_freq = '[' + final_freq + ']' outfile.write(final_str + '\n') outfile.write(final_freq + '\n') outfile.close() except Exception, e: print traceback.print_exc() # 9_pubmed计算文章摘要中词频 def cal_word_occurence_in_article_abstract(self, output_path): try: abstract_list = self.get_article_abstract(self.root) stop_words_list = self.get_stop_words(self.stop_words_path) stop_words_list.append(' ') stop_words_list.append('') # 这个要占很大的地方 word_counter = dict() for abstract in abstract_list: try: # 预处理 line = "" for ch in abstract: if ch.isalpha(): line += ch else: line += ' ' abstract = line abstract = abstract.split(' ') for word in abstract: word = word.lower() if word in stop_words_list: continue if word in word_counter: word_counter[word] += 1 else: word_counter[word] = 1 except: continue pairs = list(word_counter.items()) items = [(count, word) for (word, count) in pairs] items.sort(reverse=True) write_path = output_path + "word_occurence_in_article_abstract.txt" outfile = open(write_path, "w") final_str = "" final_freq = "" 
cnt = 0 for item in items: line = str(item[1]) + "\t" + str(item[0]) outfile.write(line + '\n') if cnt < 10: if cnt == 0: final_str = "'" + item[1] + "'" + final_str final_freq = "'" + str(item[0]) + "'"+ final_freq else: final_str = "'"+item[1]+"'" + ',' + final_str final_freq = "'" + str(item[0]) + "'" + ',' + final_freq cnt += 1 final_str = '[' + final_str + ']' final_freq = '[' + final_freq + ']' outfile.write(final_str + '\n') outfile.write(final_freq + '\n') outfile.close() except Exception, e: print traceback.print_exc() # 9_1_pmc计算文章摘要中词频 def cal_word_occurence_in_article_abstract1(self, write_path): try: abstract_list = self.get_article_abstract1() stop_words_list = self.get_stop_words(self.stop_words_path) stop_words_list.append(' ') stop_words_list.append('') # 这个要占很大的地方 word_counter = dict() for abstract in abstract_list: try: # 预处理 line = "" for ch in abstract: if ch.isalpha(): line += ch else: line += ' ' abstract = line abstract = abstract.split(' ') for word in abstract: word = word.lower() if word in stop_words_list: continue if word in word_counter: word_counter[word] += 1 else: word_counter[word] = 1 except: continue pairs = list(word_counter.items()) items = [(count, word) for (word, count) in pairs] items.sort(reverse=True) #for item in items: # print item[0], '\t', item[1] outfile = open(write_path, "w") for item in items: try: line = "" line = str(item[1]) + '\t' + str(item[0]) outfile.write(line+'\n') except Exception as ex: print ex outfile.close() except Exception, e: print traceback.print_exc() # 10_计算期刊的名字以及其地理位置的出现次数 def cal_journal_name_and_country_ouucrence(self, country_path, name_path): try: name_counter = dict() country_counter = dict() journal_info_list = self.get_article_journal_info(self.root) for item in journal_info_list: item = item.split(',') journal_name = item[0] journal_country = item[1] if journal_name in name_counter: name_counter[journal_name] += 1 else: name_counter[journal_name] = 1 if journal_country in 
country_counter: country_counter[journal_country] += 1 else: country_counter[journal_country] = 1 pairs = list(name_counter.items()) reverse_pairs = [ (count,name) for (name,count) in pairs ] reverse_pairs.sort(reverse=True) outfile = open(name_path, "w") for item in reverse_pairs: name = str(item[1]) count = str(item[0]) line = "" line += name line += '\t' line += count outfile.write(line + '\n') outfile.close() pairs = list(country_counter.items()) reverse_pairs = [(count, country) for (country, count) in pairs] reverse_pairs.sort(reverse=True) outfile = open(country_path, "w") for item in reverse_pairs: name = str(item[1]) count = str(item[0]) line = "" line += name line += '\t' line += count outfile.write(line + '\n') outfile.close() except Exception, e: print traceback.print_exc() # 11_计算发布量前10的论文,在不同区的数量 def cal_num_in_diff_area(self, input_path, out_path): try: area_counter = {} cnt = 0 infile = open(input_path, "r") for line in infile: cnt += 1 if cnt == 1: continue line = line.rstrip('\n').split(' ') num = int(line[1]) area = line[3] if area in area_counter: area_counter[area] += num else: area_counter[area] = num infile.close() outfile = open(out_path, "w") for area in area_counter: line = "" line += str(area) line += " " line += str(area_counter[area]) outfile.write(line + '\n') outfile.close() except Exception, e: print traceback.print_exc() # 12_计算影响因子 def cal_aver_if_factor(self, input_path): try: cnt = 0 infile = open(input_path, "r") total_num = 0 total_factor = 0.0 for line in infile: cnt += 1 if cnt == 1: continue line = line.rstrip('\n').split(' ') num = int(line[1]) factor = float(line[2]) total_num += num total_factor += factor * num infile.close() print total_factor / total_num except Exception, e: print traceback.print_exc() # 13_获取停用词 def get_stop_words(self, stop_words_path): result_list = [] infile = open(stop_words_path, "r") for line in infile: line = line.rstrip('\n') result_list.append(line) infile.close() return result_list # 14_测试函数 
def test(self):
    """Print the journal info extracted from ``self.root``.

    Calls ``self.get_article_journal_info(self.root)``, prints the number
    of entries returned, then prints every entry on its own line.
    """
    journal_info_list = self.get_article_journal_info(self.root)
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(len(journal_info_list))
    for journal_info in journal_info_list:
        print(journal_info)
- main.py
#-*- coding:utf-8 -*-
"""Driver script for the XML statistics and the network analysis.

Defines the input/output paths, a statistics pass over a parsed XML
object, two author-collaboration analyses built on ``MyGraph``, two
edge-list loaders (``test_for_srx`` and ``test_for_jcx``) and ``main``,
which currently runs ``test_for_srx`` only.
"""
import traceback
from itertools import islice

from XmlParser import *
from MyGraph import *

STOP_WORDS_PATH = "../file/stop_words.txt"
XML_PATH1 = "../data/PUBMED/LANCET/2006/lancet_2006_1570.xml"
#XML_PATH2 = "../data/PUBMED/LANCET/2009/lancet_2009_1516.xml"
#OUTPUT_PATH1 = "../output/network_analysis/PUBMED/LANCET/2006/"
#OUTPUT_PATH2 = "../output/network_analysis/PUBMED/LANCET/2009/"
OUTPUT_PATH3 = "../output/src_output/edge.txt"
INPUT_PATH = "../data/src_input/citation.csv"
OUTPUT_PATH = "../output/src_output/"


# @xml_parser_obj: the object produced by parsing the XML
# @OUTPUT_PATH: output path for the results of the statistical analysis
def statical_analysis(xml_parser_obj, OUTPUT_PATH):
    """Run the word-occurrence statistics for abstracts and titles.

    Args:
        xml_parser_obj: parser object providing
            ``cal_word_occurence_in_article_abstract`` and
            ``cal_word_occurence_in_article_title``.
        OUTPUT_PATH: path handed to both calls.

    Any exception is caught and its traceback is printed.
    """
    try:
        xml_parser_obj.cal_word_occurence_in_article_abstract(OUTPUT_PATH)
        xml_parser_obj.cal_word_occurence_in_article_title(OUTPUT_PATH)
        print("[INFO]: statical_analysis is finished!")
    except Exception:
        # print_exc() prints the traceback itself and returns None.
        traceback.print_exc()


# @xml_parser_obj: the object produced by parsing the XML
# @OUTPUT_PATH: output path for the results of the static network analysis
def author_collaboration_network_analysis(xml_parser_obj, OUTPUT_PATH):
    """Build the author graph of one parsed XML file and compute its statistics.

    Args:
        xml_parser_obj: parser object providing ``get_article_author``.
        OUTPUT_PATH: path passed to ``MyGraph.set_output_path``.

    Any exception is caught and its traceback is printed.
    """
    try:
        # get the author clique list
        author_clique_list = xml_parser_obj.get_article_author()

        # construct the graph based on the author clique list
        graph = MyGraph()
        graph.construct_graph(author_clique_list)
        graph.set_output_path(OUTPUT_PATH)

        # calculate the statistics
        graph.cal_num_of_nodes()
        graph.cal_num_of_edges()
        graph.cal_degree_distribution()
        graph.cal_density()
        # the collaboration network is usually not connected
        #graph.cal_average_shortest_path_length()
        graph.cal_average_clustering()
        graph.write_to_pajek_net1()

        # This call does not really draw communities; it only draws the
        # different cliques, i.e. the whole graph.
        graph.draw_community()

        graph.set_max_connected_component_subgraph()
        graph.draw_max_connected_component_subgraph()
        graph.cal_average_shortest_path_length_in_max_connected_component_subgraph()

        #graph.draw_graph()
        #graph.draw_graph_spring_layout()
        #graph.draw_graph_random()
        print("[INFO]: author_collaboration_network_analysis is finished!")
    except Exception:
        traceback.print_exc()


def author_collaboration_network_analysis1(xml_parser_obj1, xml_parser_obj2, OUTPUT_PATH):
    """Build one author graph from two parsed XML files and compute its statistics.

    Args:
        xml_parser_obj1: first parser object providing ``get_article_author``.
        xml_parser_obj2: second parser object providing ``get_article_author``.
        OUTPUT_PATH: path passed to ``MyGraph.set_output_path``.

    Any exception is caught and its traceback is printed.
    """
    try:
        # get the author clique list; copy first so the list returned by the
        # first parser is not modified by the extend() below
        author_clique_list = list(xml_parser_obj1.get_article_author())
        author_clique_list.extend(xml_parser_obj2.get_article_author())

        # construct the graph based on the author clique list
        graph = MyGraph()
        graph.construct_graph(author_clique_list)
        graph.set_output_path(OUTPUT_PATH)

        # calculate the statistics
        graph.cal_num_of_nodes()
        graph.cal_num_of_edges()
        graph.cal_degree_distribution()
        graph.cal_density()
        graph.cal_average_shortest_path_length()
        graph.cal_average_clustering()
        graph.write_to_pajek_net1()
        graph.draw_community()

        #graph.draw_graph()
        #graph.draw_graph_spring_layout()
        #graph.draw_graph_random()
        print("[INFO]: author_collaboration_network_analysis is finished!")
    except Exception:
        traceback.print_exc()


def test_for_srx():
    """Load the comma-separated edge list at INPUT_PATH and print graph statistics.

    The first two comma-separated fields of every line are used as the
    two end points of an edge.  Blank lines are skipped.  Any exception is
    caught and its traceback is printed.
    """
    try:
        graph = MyGraph()
        graph.set_output_path(OUTPUT_PATH)

        with open(INPUT_PATH, "r") as infile:
            for line in infile:
                if not line.strip():
                    continue  # tolerate blank lines, e.g. at the end of the file
                # Drop the line terminator first; otherwise it stays glued to
                # the second field when the line has exactly two columns.
                fields = line.rstrip('\r\n').split(',')
                graph.add_edge(fields[0], fields[1])
        print("[INFO]: graph is finished!")

        graph.cal_average_clustering()
        graph.cal_average_shortest_path_length_in_max_connected_component_subgraph()
        graph.cal_degree_distribution()
        graph.cal_density()
        graph.cal_transitivity()
    except Exception:
        traceback.print_exc()


def test_for_jcx():
    """Load up to 10000 lines of the edge list at INPUT_PATH and draw the graph.

    The first two whitespace-separated fields of every line are used as
    the two end points of an edge.  Blank lines are skipped but still
    count towards the line limit.  Any exception is caught and its
    traceback is printed.
    """
    try:
        graph = MyGraph()
        graph.set_output_path(OUTPUT_PATH)

        max_lines = 10000  # only the first 10000 lines of the input are read
        with open(INPUT_PATH, "r") as infile:
            for line in islice(infile, max_lines):
                fields = line.split()
                if not fields:
                    continue
                graph.add_edge(fields[0], fields[1])
        print("[INFO]: graph is finished!")

        # Statistics currently disabled for this data set:
        #graph.cal_average_clustering()
        #graph.cal_average_shortest_path_length_in_max_connected_component_subgraph()
        #graph.cal_degree_distribution()
        #graph.cal_density()
        #graph.cal_transitivity()

        title = "Social Network - Live Journal"
        graph.draw_graph(title)
    except Exception:
        traceback.print_exc()


def main():
    """Entry point: run ``test_for_srx`` and print progress messages."""
    try:
        print("[INFO]: Programme is running......")

        # parse the xml and get the result
        #a_obj1 = XmlParser(XML_PATH1, STOP_WORDS_PATH)
        #a_obj2 = XmlParser(XML_PATH2, STOP_WORDS_PATH)
        #statical_analysis(a_obj1, OUTPUT_PATH1)
        #statical_analysis(a_obj2, OUTPUT_PATH2)
        #author_collaboration_network_analysis(a_obj1, OUTPUT_PATH1)
        test_for_srx()

        print("[INFO]: Programme terminated successfully!")
    except Exception:
        traceback.print_exc()


# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    main()
0 0
- python网络分析-network第一版的封装
- python网络分析-network第一版的封装
- Network in Network 网络分析
- Social Network 社交网络分析
- 实现自己的Ajax对象封装器 -- Kajax --第一版完成
- 社会网络分析(Social Network Analysis)
- Network Analyst网络分析 教程下载
- 网络分析工具Wireshark Network Analyzer
- [Network Analysis] 复杂网络分析总结
- [Network Analysis] 复杂网络分析总结
- [Network Analysis] 复杂网络分析总结
- [Network Analysis] 复杂网络分析总结
- [Network Analysis] 复杂网络分析总结
- (Paper)Network in Network网络分析
- (Paper)Network in Network网络分析
- 第一版Python程序
- 一个ArcGIS网络分析的最短路径例子||A Network Analyst Shortest Route of ArcGIS
- 社交网络分析(Social Network Analysis)
- OpenSSL的编译
- centos设置Tomcat8自启动
- 半城烟沙
- 搭建RocketMQ服务
- 事务
- python网络分析-network第一版的封装
- 拆东墙
- win7 32位下安装 python 第三方组件 wl 和 egg
- OkHttp使用小记
- R语言中报错:figure margins too large解决
- POJ 2689 区间筛选质数
- 数据结构(C++)——顺序表(线性表)
- 寒假篇10——凌乱的yyy
- ThreadLocal在spring框架中的作用