TF/05_Nearest_Neighbor_Methods/TF/05_Nearest_Neighbor_Methods

来源:互联网 发布:云主机绑定域名 编辑:程序博客网 时间:2024/05/22 07:09

TF/05_Nearest_Neighbor_Methods

# 05_address_matching.py
# Address Matching with k-Nearest Neighbors
#----------------------------------
#
# This script illustrates a way to perform
# address matching between two data sets.
#
# For each test address, we will return the
# closest reference address to it.
#
# We will consider two distance functions:
# 1) Edit distance for street number/name and
# 2) Euclidean distance (L2) for the zip codes

import random
import string

import numpy as np
import tensorflow as tf
from tensorflow.python.framework import ops

ops.reset_default_graph()

# First we generate the data sets we will need
# n = Size of created data sets
n = 10
street_names = ['abbey', 'baker', 'canal', 'donner', 'elm']
street_types = ['rd', 'st', 'ln', 'pass', 'ave']

random.seed(31)  # make results reproducible
rand_zips = [random.randint(65000, 65999) for _ in range(5)]


def create_typo(s, prob=0.75):
    """Return `s` with one character replaced at random, with probability `prob`.

    The replacement is drawn from the lowercase ASCII letters and may equal
    the character it replaces, so a "typo" can leave the string unchanged.
    An empty string is returned as-is (there is no position to alter).
    """
    # Draw the probability sample first so the random stream is consumed the
    # same way regardless of whether `s` is empty.
    make_typo = random.uniform(0, 1) < prob
    if make_typo and s:
        rand_ind = random.choice(range(len(s)))
        s_list = list(s)
        s_list[rand_ind] = random.choice(string.ascii_lowercase)
        s = ''.join(s_list)
    return s


# Generate the reference dataset
numbers = [random.randint(1, 9999) for _ in range(n)]
streets = [random.choice(street_names) for _ in range(n)]
street_suffs = [random.choice(street_types) for _ in range(n)]
zips = [random.choice(rand_zips) for _ in range(n)]
full_streets = [str(x) + ' ' + y + ' ' + z
                for x, y, z in zip(numbers, streets, street_suffs)]
reference_data = [list(x) for x in zip(full_streets, zips)]

# Generate test dataset with some typos (only the street name is perturbed;
# house number, street suffix and zip are copied from the reference data)
typo_streets = [create_typo(x) for x in streets]
typo_full_streets = [str(x) + ' ' + y + ' ' + z
                     for x, y, z in zip(numbers, typo_streets, street_suffs)]
test_data = [list(x) for x in zip(typo_full_streets, zips)]

# Now we can perform address matching

# Placeholders
test_address = tf.sparse_placeholder(dtype=tf.string)
test_zip = tf.placeholder(shape=[None, 1], dtype=tf.float32)
ref_address = tf.sparse_placeholder(dtype=tf.string)
ref_zip = tf.placeholder(shape=[None, n], dtype=tf.float32)

# Declare Zip code distance for a test zip and reference set
zip_dist = tf.square(tf.subtract(ref_zip, test_zip))

# Declare Edit distance for address
address_dist = tf.edit_distance(test_address, ref_address, normalize=True)

# Create similarity scores: min-max scale the zip distances so that the
# closest reference zip scores 1 and the farthest scores 0.
zip_max = tf.gather(tf.squeeze(zip_dist), tf.argmax(zip_dist, 1))
zip_min = tf.gather(tf.squeeze(zip_dist), tf.argmin(zip_dist, 1))
# Guard against 0/0 when every reference zip is equally distant from the
# test zip: the numerator is then 0 everywhere, so the zip similarity becomes
# 0 and the address similarity alone decides the match (instead of NaN).
zip_range = tf.maximum(tf.subtract(zip_max, zip_min), 1e-12)
zip_sim = tf.div(tf.subtract(zip_max, zip_dist), zip_range)
address_sim = tf.subtract(1., address_dist)

# Combine distance functions
address_weight = 0.5
zip_weight = 1. - address_weight
weighted_sim = tf.add(tf.transpose(tf.multiply(address_weight, address_sim)),
                      tf.multiply(zip_weight, zip_sim))

# Predict: Get max similarity entry
top_match_index = tf.argmax(weighted_sim, 1)


def sparse_from_word_vec(word_vec):
    """Build a character-level `tf.SparseTensorValue` from a list of strings.

    Character `yi` of word `xi` is stored at index `[xi, 0, yi]`. The dense
    shape is `[num_words, 1, max_word_length]` so that every emitted index
    lies inside the declared shape.
    """
    num_words = len(word_vec)
    indices = [[xi, 0, yi]
               for xi, x in enumerate(word_vec)
               for yi, _ in enumerate(x)]
    chars = list(''.join(word_vec))
    # Last dimension must cover the longest word; `or [0]` handles an empty
    # word list without raising.
    max_len = max([len(x) for x in word_vec] or [0])
    return tf.SparseTensorValue(indices, chars, [num_words, 1, max_len])


# Loop through test indices
reference_addresses = [x[0] for x in reference_data]
reference_zips = np.array([[x[1] for x in reference_data]])

# Create sparse address reference set
sparse_ref_set = sparse_from_word_vec(reference_addresses)

# Run the graph inside a context manager so the session is always released.
with tf.Session() as sess:
    for i in range(n):
        test_address_entry = test_data[i][0]
        test_zip_entry = [[test_data[i][1]]]

        # Create sparse address vectors: the test address is repeated n times
        # so it is compared against each of the n reference addresses.
        test_address_repeated = [test_address_entry] * n
        sparse_test_set = sparse_from_word_vec(test_address_repeated)

        feeddict = {test_address: sparse_test_set,
                    test_zip: test_zip_entry,
                    ref_address: sparse_ref_set,
                    ref_zip: reference_zips}
        best_match = sess.run(top_match_index, feed_dict=feeddict)
        best_street = reference_addresses[best_match[0]]

        [best_zip] = reference_zips[0][best_match]
        [[test_zip_]] = test_zip_entry
        print('Address: ' + str(test_address_entry) + ', ' + str(test_zip_))
        print('Match  : ' + str(best_street) + ', ' + str(best_zip))
Address: 2308 bakar rd, 65480
Match  : 2308 baker rd, 65480
Address: 709 bakeo pass, 65480
Match  : 709 baker pass, 65480
Address: 2273 glm ln, 65782
Match  : 2273 elm ln, 65782
Address: 1843 donner st, 65402
Match  : 1843 donner st, 65402
Address: 8769 klm st, 65402
Match  : 8769 elm st, 65402
Address: 3798 dpnner ln, 65012
Match  : 3798 donner ln, 65012
Address: 2288 bajer pass, 65012
Match  : 2288 baker pass, 65012
Address: 2416 epm ln, 65480
Match  : 2416 elm ln, 65480
Address: 543 abgey ave, 65115
Match  : 543 abbey ave, 65115
Address: 994 abbey st, 65480
Match  : 994 abbey st, 65480