用户房源推荐—基于内容的推荐算法(CB)

来源:互联网 发布:window系统编程pdf 编辑:程序博客网 时间:2024/04/28 04:00

CB推荐算法根据用户过去喜欢的产品,为用户推荐和他过去喜欢的产品相似的产品。采用基于特征的空间向量模型,并用最近邻方法进行推荐。
算法步骤:

  • 抽取房源的基本特征,考虑到租房的实际情况,确定的基本特征有价格 (house_price) 面积 (house_area),房屋类型 (house_type),地区 (district)

  • 利用一个用户过去喜欢(及不喜欢)的房源特征数据,来学习出此用户的喜好特征。先将各个特征进行分类:价格分为10类,面积分为10类,房屋类型分为6类,地区分为9类。再利用历史数据统计出该用户每个特征中各类别出现的次数,并除以该用户收藏的(该特征有效的)房源数,得到该用户在此特征上的喜好向量 T1、T2、T3、T4。

  • 对每个待推荐房源,按同样的分类方式构造各特征的类别向量(所属类别为1,其余为0):
    价格 t1: (house_price),
    面积 t2: (house_area),
    类型 t3: (house_type),
    地区 t4: (district)。
    然后分别计算房源各特征向量与用户对应喜好向量的余弦相似度:

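设 $T_i$ 为用户在第 $i$ 个特征上的喜好向量,$t_i$ 为房源在该特征上的类别向量,则

$$\cos\theta_1=\frac{T_1\cdot t_1}{\lVert T_1\rVert\,\lVert t_1\rVert},\quad
\cos\theta_2=\frac{T_2\cdot t_2}{\lVert T_2\rVert\,\lVert t_2\rVert},\quad
\cos\theta_3=\frac{T_3\cdot t_3}{\lVert T_3\rVert\,\lVert t_3\rVert},\quad
\cos\theta_4=\frac{T_4\cdot t_4}{\lVert T_4\rVert\,\lVert t_4\rVert}$$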

用户在实际租房时,更多的是考虑房屋的价格和地区,因此主观确定4个特征的权重为
ω=[ω1,ω2,ω3,ω4]=[0.35,0.15,0.15,0.35]

(权值可根据实际推荐进行调整)

则每个房源与用户喜好的加权相似度为
similar_item = ω1·cosθ1 + ω2·cosθ2 + ω3·cosθ3 + ω4·cosθ4
对所有的similar_item进行由大到小排序,取前10作为推荐房源。

# -*- coding: utf-8 -*-
"""Content-based (CB) rental-listing recommender.

For every user, a preference vector is built per feature (price, area, room
type, district) from the listings the user has bookmarked.  Each candidate
listing is then scored by the weighted cosine similarity between its one-hot
feature vectors and the user's preference vectors, and the best-scoring
listings are printed.

The module is written to run unchanged on Python 2 and Python 3.
"""
import bisect
import itertools
import time

import numpy as np

try:
    import pyodbc
except ImportError:
    # The scoring logic does not need a database driver; only house_data() does.
    pyodbc = None

# Upper bounds (inclusive) of the first nine price / area classes; anything
# above the last bound falls into the tenth class.
PRICE_BOUNDS = (1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000)
AREA_BOUNDS = (20, 30, 40, 50, 60, 70, 80, 90, 100)

# Substrings identifying each class, tested in order; no match -> last class.
TYPE_KEYWORDS = ('1室', '2室', '3室', '4室', '5室')
DISTRICT_KEYWORDS = ('滨江', '西湖', '上城', '下城', '江干', '拱墅', '萧山', '余杭')

# Score given to an excluded listing (status == 1) or to a missing feature.
MISSING_SCORE = -9999


def _normalise(counts, total):
    """Return counts divided by total, or an all-zero vector if total is 0."""
    if not total:
        return [0.0] * len(counts)
    total = float(total)
    return [value / total for value in counts]


def _keyword_class(text, keywords):
    """Return the index of the first keyword contained in text, else len(keywords)."""
    for position, keyword in enumerate(keywords):
        if keyword in text:
            return position
    return len(keywords)


def _legacy_global(name):
    """Fetch a module-level variable for callers relying on the old global-based API."""
    try:
        return globals()[name]
    except KeyError:
        raise NameError("global name %r is not defined" % name)


def _emit(*parts):
    """Print parts separated by single spaces (same output on Python 2 and 3)."""
    print(" ".join(str(part) for part in parts))


class recommend_house:
    '''Content-based recommender for rental listings.'''

    def house_data(self):
        '''
        Load the listings and the users' bookmarked listings from the database.

        Returns:
            (rent_house_info, user_house_info, collect_user_id) where
            rent_house_info is every row of house_rent_info_geren
            (id, house_price, house_area, house_type, district, status),
            user_house_info[i] is the list of bookmarked listing rows (without
            the id column) of user collect_user_id[i].

        Raises:
            ImportError: if pyodbc is not installed.
        '''
        if pyodbc is None:
            raise ImportError("pyodbc is required to load data from the database")
        cnxn = pyodbc.connect('DSN=zjx;UID=root')
        try:
            cursor = cnxn.cursor()
            sql = "select DISTINCT uid,h_id from test.shoucang where  h_type = 1 order by uid"
            cursor.execute(sql)
            user_info = cursor.fetchall()
            sql = "select id, house_price, house_area, house_type, district,status from test.house_rent_info_geren"
            cursor.execute(sql)
            rent_house_info = cursor.fetchall()
        finally:
            # Release the connection even if a query fails.
            cnxn.close()
        user_house_info, collect_user_id = self._group_user_houses(
            user_info, rent_house_info)
        return rent_house_info, user_house_info, collect_user_id

    def _group_user_houses(self, user_info, rent_house_info):
        '''
        Group (uid, h_id) rows by user and resolve each h_id to its listing.

        Rows must already be ordered by uid (consecutive rows with the same
        uid form one user).  Bookmarks whose listing id is unknown are
        ignored, and users left without any resolvable listing are dropped so
        that the two returned lists always stay index-aligned.

        Returns:
            (user_house_info, collect_user_id)
        '''
        house_by_id = {}
        for house in rent_house_info:
            # Keep the first row for a duplicated id, like a linear search would.
            house_by_id.setdefault(house[0], house)

        user_house_info = []
        collect_user_id = []
        for uid, rows in itertools.groupby(user_info, key=lambda row: row[0]):
            houses = [house_by_id[row[1]][1:]
                      for row in rows if row[1] in house_by_id]
            if houses:
                collect_user_id.append(uid)
                user_house_info.append(houses)
        return user_house_info, collect_user_id

    def cosine_similarity(self, vector_A, vector_B, len_vector_A):
        '''
        Cosine similarity between a preference vector and a one-hot vector.

        Because vector_B is one-hot, the inner product is simply the entry of
        vector_A at the position of the 1, and the norm of vector_B is 1.

        Args:
            vector_A: user preference vector.
            vector_B: one-hot class vector of the listing.
            len_vector_A: Euclidean norm of vector_A (pre-computed by caller).

        Returns:
            The cosine similarity, or 0.0 when vector_A has zero norm or
            vector_B contains no 1.
        '''
        if not len_vector_A:
            return 0.0
        try:
            index = list(vector_B).index(1)
        except ValueError:
            return 0.0
        return vector_A[index] / len_vector_A

    def price_classify(self, count1, price):
        '''
        Increment the price class counter that price falls into.

        Classes: <=1000, (1000,1500], ..., (4500,5000], >5000.

        Args:
            count1: list of 10 counters, modified in place.
            price: numeric listing price.

        Returns:
            count1.
        '''
        count1[bisect.bisect_left(PRICE_BOUNDS, price)] += 1
        return count1

    def area_classify(self, count2, area):
        '''
        Increment the area class counter that area falls into.

        Classes: <=20, (20,30], ..., (90,100], >100.

        Args:
            count2: list of 10 counters, modified in place.
            area: numeric listing area.

        Returns:
            count2.
        '''
        count2[bisect.bisect_left(AREA_BOUNDS, area)] += 1
        return count2

    def type_classify(self, count3, room_type):
        '''
        Increment the room-type class counter matching room_type.

        Args:
            count3: list of 6 counters (1..5 rooms, other), modified in place.
            room_type: room-type description string.

        Returns:
            count3.
        '''
        count3[_keyword_class(room_type, TYPE_KEYWORDS)] += 1
        return count3

    def district_classify(self, count4, room_district):
        '''
        Increment the district class counter matching room_district.

        Args:
            count4: list of 9 counters (8 named districts, other), modified
                in place.
            room_district: district description string.

        Returns:
            count4.
        '''
        count4[_keyword_class(room_district, DISTRICT_KEYWORDS)] += 1
        return count4

    def count_price(self, collect_house_info):
        '''
        Build a user's price preference vector.

        Args:
            collect_house_info: the user's bookmarked listing rows
                (price, area, type, district, status).

        Returns:
            List of 10 class frequencies; listings without a price are
            ignored, and the vector is all zeros if none has a price.
        '''
        count1 = [0] * 10
        valid = 0
        for item in collect_house_info:
            if item[0] is None:
                continue
            self.price_classify(count1, item[0])
            valid += 1
        return _normalise(count1, valid)

    def count_area(self, collect_house_info):
        '''
        Build a user's area preference vector.

        Args:
            collect_house_info: the user's bookmarked listing rows.

        Returns:
            List of 10 class frequencies over the listings whose area can be
            converted with int(); all zeros if there is no such listing.
        '''
        count2 = [0] * 10
        valid = 0
        for item in collect_house_info:
            try:
                area = int(item[1])
            except (TypeError, ValueError):
                continue  # missing or non-numeric area
            self.area_classify(count2, area)
            valid += 1
        return _normalise(count2, valid)

    def count_type(self, collect_house_info):
        '''
        Build a user's room-type preference vector.

        Args:
            collect_house_info: the user's bookmarked listing rows.

        Returns:
            List of 6 class frequencies over the listings with a room type;
            all zeros if there is no such listing.
        '''
        count3 = [0] * 6
        valid = 0
        for item in collect_house_info:
            if item[2] is None or item[2] == '' or item[2] == 'null':
                continue
            self.type_classify(count3, item[2])
            valid += 1
        return _normalise(count3, valid)

    def count_district(self, collect_house_info):
        '''
        Build a user's district preference vector.

        Args:
            collect_house_info: the user's bookmarked listing rows.

        Returns:
            List of 9 class frequencies over the listings with a district;
            all zeros if there is no such listing.
        '''
        count4 = [0] * 9
        valid = 0
        for item in collect_house_info:
            if item[3] is None or item[3] == 'null':
                continue
            self.district_classify(count4, item[3])
            valid += 1
        return _normalise(count4, valid)

    def CB_recommend(self, similar_weight, collect_house_info=None,
                     rent_house_info=None):
        '''
        Score every listing against one user's preference vectors.

        The score is 0.35*price + 0.15*area + 0.15*type + 0.35*district
        cosine similarity.  A listing whose status column equals 1 gets
        MISSING_SCORE outright; a missing feature contributes MISSING_SCORE
        for that feature.

        Args:
            similar_weight: list the scores are appended to.
            collect_house_info: the user's bookmarked listing rows
                (price, area, type, district, status).  When omitted, the
                module-level variable of the same name is used (legacy
                behaviour).
            rent_house_info: all listing rows
                (id, price, area, type, district, status).  When omitted, the
                module-level variable of the same name is used.

        Returns:
            similar_weight, with one score appended per listing in order.
        '''
        if collect_house_info is None:
            collect_house_info = _legacy_global('collect_house_info')
        if rent_house_info is None:
            rent_house_info = _legacy_global('rent_house_info')

        # Preference vectors and their norms depend only on the user, so they
        # are computed once rather than per listing.
        vector_price = self.count_price(collect_house_info)
        vector_area = self.count_area(collect_house_info)
        vector_type = self.count_type(collect_house_info)
        vector_district = self.count_district(collect_house_info)
        norm_price = np.sqrt(np.inner(vector_price, vector_price))
        norm_area = np.sqrt(np.inner(vector_area, vector_area))
        norm_type = np.sqrt(np.inner(vector_type, vector_type))
        norm_district = np.sqrt(np.inner(vector_district, vector_district))

        for item in rent_house_info:
            if item[5] == 1:
                similar_weight.append(MISSING_SCORE)
                continue

            if item[1] is None:
                sim_price = MISSING_SCORE
            else:
                sim_price = self.cosine_similarity(
                    vector_price, self.price_classify([0] * 10, item[1]),
                    norm_price)

            try:
                # Convert into a local; the listing row itself is not modified.
                area = int(item[2])
            except (TypeError, ValueError):
                sim_area = MISSING_SCORE
            else:
                sim_area = self.cosine_similarity(
                    vector_area, self.area_classify([0] * 10, area),
                    norm_area)

            if item[3] is None or item[3] == '' or item[3] == 'null':
                sim_type = MISSING_SCORE
            else:
                sim_type = self.cosine_similarity(
                    vector_type, self.type_classify([0] * 6, item[3]),
                    norm_type)

            if item[4] is None or item[4] == 'null':
                sim_district = MISSING_SCORE
            else:
                sim_district = self.cosine_similarity(
                    vector_district,
                    self.district_classify([0] * 9, item[4]),
                    norm_district)

            weight_cos = 0.35 * sim_price + 0.15 * sim_area + 0.15 * sim_type + 0.35 * sim_district
            similar_weight.append(weight_cos)
        return similar_weight

    def top_recommendations(self, similar_weight, rent_house_info, top_n=10):
        '''
        Return the ids of the best-scoring listings, best first.

        Args:
            similar_weight: scores, index-aligned with rent_house_info.
            rent_house_info: all listing rows; column 0 is the listing id.
            top_n: maximum number of ids to return.

        Returns:
            Up to top_n listing ids; ties keep their original listing order.
        '''
        ranked = sorted(range(len(similar_weight)),
                        key=similar_weight.__getitem__, reverse=True)
        return [rent_house_info[position][0] for position in ranked[:top_n]]


def main():
    """Print the top 10 recommended listing ids for every bookmarking user."""
    t1 = time.time()
    test = recommend_house()
    rent_house_info, user_house_info, collect_user_id = test.house_data()
    _emit("共有用户数:", len(collect_user_id))
    _emit("输出格式为:用户id,前10个推荐房源")
    for user_id, collect_house_info in zip(collect_user_id, user_house_info):
        similar_weight = test.CB_recommend(
            [], collect_house_info, rent_house_info)
        _emit(user_id, *test.top_recommendations(similar_weight, rent_house_info))
    t2 = time.time()
    _emit("运行时间:", t2 - t1)


if __name__ == '__main__':
    main()
0 1