Spark Machine Learning Practice Code



# coding: utf-8


# In[1]:


user_data=sc.textFile("spark机器学习/数据集/ml-100k/u.user")
user_data.first()




# In[72]:


user_fields=user_data.map(lambda line:line.split("|"))
print user_fields.first()
num_users=user_fields.map(lambda fields:fields[0]).count()
num_genders=user_fields.map(lambda fields:fields[2]).distinct().count()
num_occupations=user_fields.map(lambda fields: fields[3]).distinct().count()
num_zipcodes=user_fields.map( lambda fields:fields[4]).distinct().count()
print "Users:%d, gender: %d, occcupation:%d, zip code: %d" %(num_users, num_genders, num_occupations,num_zipcodes )




# In[10]:


get_ipython().magic(u'pylab inline')
ages=user_fields.map( lambda x: int(x[1])).collect()
hist( ages, bins=20, color='lightblue',normed=True)
fig=matplotlib.pyplot.gcf()
fig.set_size_inches(16,10)




# In[22]:


count_by_occupation=user_fields.map(lambda fields:(fields[3],1)).reduceByKey( lambda x,y:x+y).collect()
x_axis1=np.array([c[0] for c in count_by_occupation])
y_axis1=np.array([c[1] for c in count_by_occupation])










# In[24]:


count_by_occupation = user_fields.map(lambda fields: (fields[3], 1)).reduceByKey(lambda x, y: x + y).collect()
x_axis1 = np.array([c[0] for c in count_by_occupation])
y_axis1 = np.array([c[1] for c in count_by_occupation])
x_axis = x_axis1[np.argsort(y_axis1)]
y_axis = y_axis1[np.argsort(y_axis1)]


pos = np.arange(len(x_axis))
width = 1.0


ax = plt.axes()
ax.set_xticks(pos + (width / 2))
ax.set_xticklabels(x_axis)


plt.bar(pos, y_axis, width, color='lightblue')
plt.xticks(rotation=30)
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(16, 10)




# In[25]:


movie_data=sc.textFile(  "/spark机器学习/数据集/ml-100k/u.item")
print movie_data.first()
num_movie=movie_data.count()
print "movies: %d" % num_movie




# In[29]:


count_by_occupation




# In[28]:


x_axis1




# In[30]:


y_axis1




# In[31]:


def convert_year(x):
    try:
        return int(x[-4:])
    except:
        # 1900 serves as a sentinel value for missing or malformed release dates
        return 1900




# In[37]:


movie_data=sc.textFile(  "/spark机器学习/数据集/ml-100k/u.item")
print movie_data.first()
num_movie=movie_data.count()
print "movies: %d" % num_movie
movie_fields=movie_data.map(lambda lines:lines.split("|"))
years=movie_fields.map( lambda fields:fields[2]).map(lambda x:convert_year(x))
years_filtered=years.filter(  lambda x:x!=1900)
movie_ages=years_filtered.map( lambda yr:1998-yr).countByValue()
values=movie_ages.values()
bins=movie_ages.keys()
hist( values, bins=bins, color='lightblue', normed=True)
fig=matplotlib.pyplot.gcf()
fig.set_size_inches(16,10)




# In[54]:


rating_data=sc.textFile("/spark机器学习/数据集/ml-100k/u.data")
print rating_data.first()  # fields: user id, movie id, rating, timestamp
num_ratings=rating_data.count()
print "Ratings: %d" % num_ratings


rating_data=rating_data.map(lambda line: line.split("\t"))
ratings=rating_data.map( lambda fields: int(fields[2]))
max_rating=ratings.reduce( lambda x,y:max(x,y))   # maximum rating
min_rating=ratings.reduce( lambda x,y:min(x,y))   # minimum rating
mean_rating=ratings.reduce( lambda x,y: x+y )/float(num_ratings)     # mean rating (float division so the average is not truncated)
median_rating=np.median(ratings.collect() )      # median rating
ratings_per_user=num_ratings/float(num_users)    # average number of ratings per user
ratings_per_movie=num_ratings/float(num_movie)   # average number of ratings per movie


print "Min rating: %d" %min_rating
print "Max rating: %d" %max_rating
print"Average rating:%2.2f" %mean_rating
print "Median rating:%d" % median_rating
print "Average # of ratings per user:%2.2f" % ratings_per_user
print "Average # of ratings per movie:%2.2f" % ratings_per_movie


ratings.stats()    # Spark's built-in stats() on the RDD gives the same summary statistics.
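
# Not in the original: the StatCounter returned by stats() also exposes each
# figure individually via count(), mean(), stdev(), max() and min(), e.g.:
rating_stats = ratings.stats()
print "count: %d, mean: %2.2f, stdev: %2.2f" % (rating_stats.count(), rating_stats.mean(), rating_stats.stdev())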










# In[55]:


# Bar chart of the distribution of rating values
count_by_rating = ratings.countByValue()  # number of times each rating value occurs
x_axis = np.array(count_by_rating.keys())  # the rating values (1-5) form the x axis
y_axis = np.array([float(c) for c in count_by_rating.values()]) # counts per rating value form the y axis
# normalize the y axis so it represents proportions
y_axis_normed = y_axis / y_axis.sum()


pos = np.arange(len(x_axis))
width = 1.0


ax = plt.axes()
ax.set_xticks(pos + (width / 2))
ax.set_xticklabels(x_axis)


plt.bar(pos, y_axis_normed, width, color='lightblue')
plt.xticks(rotation=30)
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(16, 10)














# In[56]:


# group the ratings by user id
user_ratings_grouped=rating_data.map( lambda fields: (int(fields[0]), int(fields[2]))).groupByKey()
# the size of each user's grouped collection is that user's number of ratings
user_ratings_byuser=user_ratings_grouped.map( lambda(k,v):(k,len(v)))
# look at the rating counts of the first 5 users
user_ratings_byuser.take(5)
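
# Not in the original: the same per-user counts can be computed without
# materialising the grouped values, which is usually cheaper than groupByKey.
user_rating_counts = rating_data.map(lambda fields: (int(fields[0]), 1)).reduceByKey(lambda x, y: x + y)
user_rating_counts.take(5)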




# In[58]:


# histogram of the distribution of ratings per user
user_ratings_byuser_local=user_ratings_byuser.map( lambda(k,v):v).collect()
hist(user_ratings_byuser_local,bins=200,color='lightblue',normed=True)
fig=matplotlib.pyplot.gcf()
fig.set_size_inches(16,10)




# In[71]:


# Fill strategy for problematic release dates: replace bad values with the median release year
# 1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
years_pre_processed=movie_fields.map(  lambda fields:fields[2]).map(lambda x: convert_year(x)).collect()
years_pre_processed_array=np.array( years_pre_processed)
# First compute the mean and median release year (excluding the 1900 sentinel); the median will replace the non-conforming values
mean_year=np.mean( years_pre_processed_array[years_pre_processed_array!=1900])
median_year=np.median(years_pre_processed_array[years_pre_processed_array!=1900])
print 'Median year: %d' % median_year
print 'Mean year: %d' % mean_year


index_bad_data=np.where(  years_pre_processed_array==1900)[0][0]
years_pre_processed_array[index_bad_data]=median_year      # fill with the median year


print 'index of 1900 after assigning median: %s' % np.where( years_pre_processed_array==1900 )[0]
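
# Not in the original: the fill above only patches the locally collected array.
# A sketch of pushing the median back into the RDD pipeline via a broadcast variable:
median_year_bc = sc.broadcast(int(median_year))
years_filled = movie_fields.map(lambda fields: convert_year(fields[2])).map(lambda yr: median_year_bc.value if yr == 1900 else yr)
years_filled.take(5)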






# In[77]:


#  user_fields record structure:     u'1', u'24', u'M', u'technician', u'85711'
# collect all possible values of occupation
all_occupations=user_fields.map( lambda fields: fields[3]).distinct().collect()
all_occupations.sort()
# assign each possible occupation a sequential index
idx=0
all_occupations_dict={}
for o in all_occupations:
    all_occupations_dict[o]=idx
    idx+=1
print "doctor的出现次数:%d" %all_occupations_dict['doctor']
print "programmer的出现次数:%d" %all_occupations_dict['programmer']
print "输出所有的职业: %s" % all_occupations


# create a numpy array of zeros whose length equals the number of possible occupations (np.zeros)
K=len( all_occupations_dict)
binary_x=np.zeros( K  )
k_programmer=all_occupations_dict['programmer']
binary_x[k_programmer]=1
print "二进制特征数组为: %s" % binary_x
print "特征数组长度为: %d" % K




# In[92]:


def extract_datetime( ts ):
    import datetime
    return datetime.datetime.fromtimestamp(ts)


# extract the hour of day from each rating's timestamp
timestamps=rating_data.map( lambda fields:int(fields[3]))
hour_of_day=timestamps.map( lambda ts: extract_datetime(ts).hour)
hour_of_day.take(10)






# In[94]:


# map the hour of day to a time-of-day bucket
def assign_tod(hr):
    times_of_day={
        'morning' : range(7,12),
        'lunch' : range(12,14),
        'afternoon' : range(14,18),
        'evening' : range( 18, 23),
        'night' : range(23, 24) + range(0, 7)
       
    }
    for k, v in times_of_day.iteritems():
        if hr in v:
            return k
        
time_of_day=hour_of_day.map( lambda hr: assign_tod(hr))
time_of_day.take(10)
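
# Not in the original: a sketch of turning the categorical time-of-day values into
# numeric indices with zipWithIndex, mirroring the dictionary approach used later for titles.
tod_dict = time_of_day.distinct().zipWithIndex().collectAsMap()
time_of_day_indexed = time_of_day.map(lambda tod: tod_dict[tod])
time_of_day_indexed.take(10)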




# In[100]:


# extract the title, stripping the release year
def extract_title(raw):
    import re
    # the regex captures the text between parentheses (the release year)
    grps=re.search( "\((\w+)\)",raw)
    if grps:
        # keep only the title portion and strip trailing whitespace
        return raw[:grps.start()].strip()
    else:
        return raw
    
raw_titles=movie_fields.map( lambda fields:fields[1])
# test title extraction on the first five raw titles
for raw_title in raw_titles.take(5):
    print extract_title( raw_title)

# strip the year from the raw titles
movie_titles=raw_titles.map( lambda m: extract_title(m))
# tokenize the titles into terms using simple whitespace splitting
title_terms=movie_titles.map( lambda t: t.split(" "))
# print the tokens of the first five titles
print title_terms.take(5)


# The text processing above is deliberately simple; other common steps include lower-casing, removing punctuation and special characters, removing stop words, and stemming.
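
# Not in the original: a rough sketch of those extra steps (lower-casing, stripping
# punctuation/special characters, dropping a few stop words); the tiny stop-word list
# is illustrative only, and stemming is omitted since it needs an external library.
import re
def tokenize(title, stop_words=frozenset(['the', 'a', 'an', 'of', 'and'])):
    cleaned = re.sub(r"[^\w\s]", " ", title.lower())
    return [t for t in cleaned.split() if t not in stop_words]

print movie_titles.map(lambda t: tokenize(t)).take(5)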






# In[102]:


# Collect all possible terms to build a term-to-index mapping dictionary
all_terms=title_terms.flatMap( lambda x:x).distinct().collect()
# create a new dictionary holding the terms, assigning each an index for 1-of-K encoding
idx=0
all_terms_dict={}
for term in all_terms:
    all_terms_dict[term]=idx
    idx+=1


print "单词总数:%d" % len( all_terms_dict )
print "Rooms 的位置:%d" % all_terms_dict['Rooms']




# In[103]:


# The same result can be obtained more efficiently with Spark's zipWithIndex function.
all_terms_dict2=title_terms.flatMap( lambda x: x).distinct().zipWithIndex().collectAsMap()
print "Total number of terms: %d" % len( all_terms_dict2 )
print "Index of 'Rooms': %d" % all_terms_dict2['Rooms']




# In[110]:


# Function that converts a collection of terms into a sparse vector representation
def create_vector( terms, term_dict):
    from scipy import sparse as sp
    num_terms=len( term_dict )
    x=sp.csc_matrix((1,num_terms))
    for t in terms:
            if t in term_dict:
                idx=term_dict[t]
                x[0,idx]=1
    return x


# apply the function to every record of the term RDD, broadcasting the term dictionary
all_terms_bcast=sc.broadcast( all_terms_dict)
term_vectors=title_terms.map( lambda terms: create_vector(terms, all_terms_bcast.value))
term_vectors.take(5)


# A title containing two terms maps to a vector with exactly two non-zero entries
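
# Not in the original: a quick check of that claim, counting the non-zero entries
# of each sparse vector via scipy's nnz attribute.
print term_vectors.map(lambda v: v.nnz).take(5)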




# In[112]:


np.random.seed( 42 )
x=np.random.randn(2)
norm_x_2=np.linalg.norm(x)
normalized_x=x/norm_x_2
print "x:\n%s" % x
print "2-Norm of x: %2.4f" % norm_x_2
print "Normalized x:\n%s" % normalized_x
print "2-Norm of normalized_x: %2.4f" % np.linalg.norm(normalized_x)




# In[113]:


# The same normalization using the MLlib feature library
from pyspark.mllib.feature import Normalizer
normalizer=Normalizer()
vector=sc.parallelize( [x])
normalized_x_mllib=normalizer.transform( vector ).first().toArray()
print "x:\n%s" % x
print "2-Norm of x: %2.4f" % norm_x_2
print "Normalized x MLlib:\n%s" % normalized_x_mllib
print "2-Norm of normalized_x: %2.4f" % np.linalg.norm(normalized_x_mllib)




# In[ ]:




