Python 聚类分析LinkedIn用户人脉网络

来源:互联网 发布:mac系统安装word文档 编辑:程序博客网 时间:2024/05/22 03:52

CODE:

#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Cluster LinkedIn connections by location and export the result as KML.

Created on 2014-8-26
@author: guaguastd
@name: linkedin_network_clusters.py

This is a Python 2 script.  It loads saved LinkedIn connections from
CONNECTIONS_DATA, geocodes every distinct location name once, groups the
coordinates into K clusters with k-means and writes one placemark per
location plus one per cluster centroid to OUT_FILE.
'''
import os
import sys
import json
from urllib2 import HTTPError

from cluster import KMeansClustering, centroid

# The helper modules below are imported after this directory is added to
# sys.path, so the order of these three statements matters.
sys.path.append(os.path.join(os.getcwd(), "e:", "eclipse", "LinkedIn", "dFile"))
# A helper function to munge data and build up an XML tree
from mykml import createKML
# get geo code
from geo import geo_from_bing

# Number of clusters requested from k-means.
K = 3

# Number of HTTP errors tolerated for one geocoding query before giving up.
MAX_GEOCODE_ERRORS = 3

# Raw strings keep the Windows backslashes literal.
CONNECTIONS_DATA = r'E:\eclipse\LinkedIn\dfile\linkedin_connections.json'
OUT_FILE = r"E:\eclipse\LinkedIn\dfile\linkedin_clusters_kmeans.kml"

# Some basic transforms applied to a location name before geocoding it.
transforms = [('Greater ', ''), (' Area', '')]


def _geocode_with_retry(geocoder, query, max_errors=MAX_GEOCODE_ERRORS):
    """Return ``geocoder.geocode(query, exactly_one=False)``, retrying on HTTPError.

    The error counter lives outside the retry loop so that it accumulates
    across attempts.  After ``max_errors`` failures for the same query the
    script terminates with a non-zero exit status.
    """
    num_errors = 0
    while True:
        try:
            return geocoder.geocode(query, exactly_one=False)
        except HTTPError as e:
            num_errors += 1
            sys.stderr.write('%s\n' % e)
            if num_errors >= max_errors:
                sys.exit('Giving up on geocoding %r after %d HTTP errors.'
                         % (query, num_errors))
            sys.stderr.write('Encountered an urllib2 error. Trying again...\n')


g = geo_from_bing()

# Open up your saved connections with extended profile information
# or fetch them again from LinkedIn if you prefer
with open(CONNECTIONS_DATA) as connections_file:
    connections = json.load(connections_file)['values']

locations = [conn['location']['name'] for conn in connections
             if 'location' in conn]

# Step 1 - Tally the frequency of each location.
# coords_freqs maps the original location name to [coordinates, count].
coords_freqs = {}
for location in locations:
    # Avoid unnecessary I/O and geo requests by building up a cache
    if location in coords_freqs:
        coords_freqs[location][1] += 1
        continue

    # Apply every transform first, then geocode the final name only once.
    transformed_location = location
    for old, new in transforms:
        transformed_location = transformed_location.replace(old, new)

    results = _geocode_with_retry(g, transformed_location)
    print(results)
    if results is None:
        continue

    for result in results:
        # Each result is of the form ("Description", (X,Y))
        coords_freqs[location] = [result[1], 1]
        break  # Disambiguation strategy is "pick first"

# Step 2 - Build up data structure for converting locations to KML
expanded_coords = []
for label, ((lat, lon), freq) in coords_freqs.items():
    # Flip lat/lon for Google Earth; repeat the point once per contact so
    # that k-means weights each location by its frequency.
    expanded_coords.append((label, [(lon, lat)] * freq))

# No need to clutter the map with unnecessary placemarks: one per location.
kml_items = [{'label': label, 'coords': '%s,%s' % coords[0]}
             for (label, coords) in expanded_coords]

# It would also be helpful to include names of your contacts on the map.
# Group the names by location in a single pass over the connections; only
# locations that were successfully geocoded are considered.
contacts_by_location = {}
for conn in connections:
    if 'location' not in conn:
        continue
    name = conn['location']['name']
    if name in coords_freqs:
        contacts_by_location.setdefault(name, []).append(
            '%s %s.' % (conn['firstName'], conn['lastName']))

for item in kml_items:
    item['contacts'] = '\n'.join(contacts_by_location.get(item['label'], []))

# Step 3 - Cluster locations and extend the KML data structure with centroids
c1 = KMeansClustering([coords
                       for (label, coords_list) in expanded_coords
                       for coords in coords_list])
centroids = [{'label': 'CENTROID', 'coords': '%s,%s' % centroid(members)}
             for members in c1.getclusters(K)]
kml_items.extend(centroids)

# Step 4 - Create the final KML output and write it to a file
kml = createKML(kml_items)
with open(OUT_FILE, 'w') as out_file:
    out_file.write(kml)

print('Data written to ' + OUT_FILE)

RESULT:

[Location(Beijing, Beijing, China 39 54m 0.0s N, 116 23m 0.0s E)]
[Location(Beijing, Beijing, China 39 54m 0.0s N, 116 23m 0.0s E)]
None
[Location(CA, United States 37 43m 0.0s N, 122 15m 0.0s W)]
[Location(Birmingham, England, United Kingdom 52 29m 0.0s N, 1 55m 0.0s W), Location(Birmingham, England, United Kingdom 52 27m 0.0s N, 1 43m 0.0s W), Location(Birmingham Airport, England, United Kingdom 52 27m 0.0s N, 1 44m 0.0s W), Location(Birmingham Business Park, England, United Kingdom 52 28m 0.0s N, 1 43m 0.0s W)]
[Location(Birmingham, England, United Kingdom 52 29m 0.0s N, 1 55m 0.0s W), Location(Birmingham, England, United Kingdom 52 27m 0.0s N, 1 43m 0.0s W), Location(Birmingham Airport, England, United Kingdom 52 27m 0.0s N, 1 44m 0.0s W), Location(Birmingham Business Park, England, United Kingdom 52 28m 0.0s N, 1 43m 0.0s W)]
[Location(China 36 33m 0.0s N, 103 59m 0.0s E)]
[Location(China 36 33m 0.0s N, 103 59m 0.0s E)]
[Location(Chengdu, Sichuan, China 30 40m 0.0s N, 104 5m 0.0s E)]
[Location(Chengdu, Sichuan, China 30 40m 0.0s N, 104 5m 0.0s E)]
[Location(Xingtai, Hebei, China 37 4m 0.0s N, 114 29m 0.0s E)]
[Location(Xingtai, Hebei, China 37 4m 0.0s N, 114 29m 0.0s E)]
[Location(United States 39 27m 0.0s N, 98 57m 0.0s W)]
[Location(United States 39 27m 0.0s N, 98 57m 0.0s W)]
[Location(Foshan, Guangdong, China 23 2m 0.0s N, 113 6m 0.0s E)]
[Location(Foshan, Guangdong, China 23 2m 0.0s N, 113 6m 0.0s E)]
Data written to E:\eclipse\LinkedIn\dfile\linkedin_clusters_kmeans.kml


0 0
原创粉丝点击