Python3爬虫：爬取大众点评网北京所有酒店评分信息

来源：互联网发布：博罗网络问政编辑：程序博客网时间：2024/04/29 03:51

本文参考《Python3爬虫实战：爬取大众点评网某地区所有酒店相关信息》一文，爬取的是北京地区的酒店。由于网站更新，原文中的一些方法已经不再适用，因此我在该文的指导下重写了一个爬虫。

爬虫无非分为这几块：分析目标、下载页面、解析页面、存储内容，其中下载页面不提。

分析目标：如Python3爬虫实战：爬取大众点评网某地区所有酒店相关信息，目的是爬取所有酒店的用户评分信息
解析页面：使用正则表达式和BeautifulSoup两种方式，一般情况都可以使用正则表达式，除非需要分辨特定用户的评论。
存储内容：酒店信息（id和名称）存储在“hotel_dianping.txt”中，每家酒店的评分信息分别存储在以“<酒店id>_<酒店名称>comments.txt”命名的文件中

Talk is cheap, show me the code.

# coding=utf-8
"""Scrape per-user hotel rating data for Beijing from dianping.com.

The crawl has two steps:

1. ``getHotelInfo`` walks the hotel listing pages and appends one
   ``"<shop path> <hotel name>"`` line per hotel to ``hotel_file``.
2. ``getEveryComment`` reads that file back and, for every hotel, appends the
   per-user scores found on its review pages to
   ``"<hotel id>_<hotel name>comments.txt"``.
"""
import re
import unicodedata

aim_url = "http://www.dianping.com/beijing/hotel"
basic_url = "http://www.dianping.com"
hotel_file = 'hotel_dianping.txt'

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30
# The site advertises 50 listing pages.
MAX_LIST_PAGES = 50
# Page size used to derive the number of review pages from the total count.
COMMENTS_PER_PAGE = 20
# Display width of the user-name column in the output files.
NAME_COLUMN_WIDTH = 30

# e.g. "action": "click","content":"/shop/8025450","title":"速8酒店"
_HOTEL_RE = re.compile(r'"action": "click","content":"(.*?)","title":"(.*?)"')
# Total number of reviews as shown next to the "all reviews" tab.
_TOTAL_RE = re.compile(r'全部</a><em class="col-exp">\((\d+)\)</em>', re.DOTALL)
# e.g. <a target="_blank" title="" href="/member/1158824000">HpointK</a>
# Captures (user id, user name).
_USER_RE = re.compile(
    r'<a target="_blank" title="" href="/member/(\d+)">(.*?)</a>', re.DOTALL)

# Two-character score label -> slot in the per-user score dict
# (room, position, service, hygiene, facility). Anything else goes to slot 5.
_SCORE_SLOT = {"房间": 0, "位置": 1, "服务": 2, "卫生": 3, "设施": 4}
_OTHER_SLOT = 5


def download_page(url):
    """Download *url* with browser-like headers and return the body as text.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    # Imported here so the pure helpers of this module stay usable when the
    # scraping dependencies are not installed.
    import requests

    # Disguise the request as a desktop browser. The cookie is a captured
    # session and will have to be refreshed once the site rejects it.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
        'Cookie': (
            '_lxsdk_cuid=15eea339434c8-0d2cff6b34e61c-c313760-100200-15eea339434c8; '
            '_lxsdk=15eea339434c8-0d2cff6b34e61c-c313760-100200-15eea339434c8; '
            '_hc.v=cec4c6d7-039d-1717-70c0-4234813c6e90.1507167802; '
            's_ViewType=1; '
            '__mta=218584358.1507168277959.1507176075960.1507176126471.5; '
            'JSESSIONID=48C46DCEFE3A390F647F52FED889020D; '
            'aburl=1; cy=2; cye=beijing; '
            '_lxsdk_s=15eea9307ab-17c-f87-123%7C%7C48'
        ),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Host': 'www.dianping.com',
    }
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    return response.content.decode('utf-8')


def getHotelInfo(hotel_file):
    """Collect hotel shop paths and names and append them to *hotel_file*.

    One line ``"<shop path> <hotel name>"`` is written per hotel, where the
    shop path looks like ``/shop/8025450``.

    Args:
        hotel_file: Path of the text file the hotel list is appended to.
    """
    for page_no in range(1, MAX_LIST_PAGES + 1):
        # The first page has no suffix; later pages are ".../hotel/p<N>".
        url = aim_url if page_no == 1 else "%s/p%d" % (aim_url, page_no)
        hotels = _HOTEL_RE.findall(download_page(url))
        if not hotels:
            # Pages beyond the real end of the listing come back empty, so
            # there is nothing more to collect.
            break
        lines = ["%s %s\n" % (shop_path, name) for shop_path, name in hotels]
        writeToFile(hotel_file, "".join(lines))
        print("第%d页OK....." % page_no)


def writeToFile(file_name, content):
    """Append *content* to the UTF-8 text file *file_name*, creating it if needed."""
    with open(file_name, 'a', encoding='utf-8') as fp:
        fp.write(content)


def getScore(page):
    """Extract the per-user score breakdown from one review page.

    BeautifulSoup is used instead of a regular expression because the scores
    have to stay grouped by the review block they belong to.

    Args:
        page: HTML text of a review page.

    Returns:
        A list with one dict per ``div.comment-rst`` block. Keys 0-4 hold the
        room, position, service, hygiene and facility scores, and key 5 holds
        any other score. A score is the single character following the
        two-character label; missing scores stay at the integer 0.
    """
    # Imported here so the pure helpers of this module stay usable when the
    # scraping dependencies are not installed.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(page, 'html.parser')
    score_list = []
    for review in soup.find_all('div', attrs={'class': 'comment-rst'}):
        scores = dict.fromkeys(range(6), 0)
        for item in review.find_all('span', attrs={'class': 'rst'}):
            text = item.getText()
            if len(text) < 3:
                # Not "<2-char label><score>": nothing to read.
                continue
            scores[_SCORE_SLOT.get(text[:2], _OTHER_SLOT)] = text[2]
        score_list.append(scores)
    return score_list


def setProperFormat(user_name):
    """Right-align *user_name* in a column that is 30 display cells wide.

    Wide and fullwidth East Asian characters occupy two cells in a monospace
    display, so a plain ``%30s`` would misalign names containing them.

    Args:
        user_name: The name to pad.

    Returns:
        *user_name* preceded by enough spaces to fill the column; unchanged
        when the name is already at least as wide as the column.
    """
    display_width = sum(
        2 if unicodedata.east_asian_width(ch) in ('W', 'F') else 1
        for ch in user_name
    )
    # A negative count yields an empty string, so over-long names pass through.
    return " " * (NAME_COLUMN_WIDTH - display_width) + user_name


def _page_count(total_comments, per_page=COMMENTS_PER_PAGE):
    """Return how many review pages hold *total_comments* reviews (at least 1)."""
    return max(1, -(-total_comments // per_page))


def getEveryComment(hotel_file, max_hotels=None, max_pages=None):
    """Download the review scores of every hotel listed in *hotel_file*.

    For each hotel a file ``"<hotel id>_<hotel name>comments.txt"`` receives a
    header row followed by one row per reviewing user.

    Args:
        hotel_file: File written by ``getHotelInfo``.
        max_hotels: Optional cap on the number of hotels to process
            (``None`` means all); handy for a quick trial run.
        max_pages: Optional cap on review pages per hotel (``None`` means all).
    """
    header = "%12s%12s%30s%15s%15s%15s%15s%15s%15s\n" % ("hotel_id", "user_id", "user_name", "rate_room", "rate_position", "rate_service", "rate_health", "rate_facility", "rate_others")

    with open(hotel_file, 'r', encoding='utf-8') as fp:
        num_hotel = 0
        for line in fp:
            if max_hotels is not None and num_hotel >= max_hotels:
                break

            # "<shop path> <name>": split on the first space only so that
            # names containing spaces survive intact.
            hotel_url, _, hotel_name = line.rstrip('\n').partition(' ')
            url_parts = hotel_url.split('/')
            if len(url_parts) < 3:
                # Blank or malformed line: no "/shop/<id>" path to work with.
                continue
            hotel_id = url_parts[2]
            num_hotel += 1

            # '/' cannot appear in a file name, so replace it.
            store_file = "%s_%scomments.txt" % (hotel_id, hotel_name.replace('/', '_'))
            writeToFile(store_file, header)

            business_url = basic_url + hotel_url + '/review_more'
            page = download_page(business_url)

            total_comments = _TOTAL_RE.findall(page)
            print(total_comments)
            if not total_comments:
                print("no review count found for %s, skipping" % business_url)
                continue
            pages = _page_count(int(total_comments[0]))
            last_page = pages if max_pages is None else min(pages, max_pages)

            for n in range(1, last_page + 1):
                comment_url = business_url + '?pageno=%s' % n
                print(comment_url)
                page = download_page(comment_url)

                user_info = _USER_RE.findall(page)
                score_list = getScore(page)
                if len(score_list) < len(user_info):
                    # Users and score blocks are paired by position; with
                    # fewer score blocks than users the pairing is unreliable,
                    # so give up on this hotel.
                    print("%d users but only %d score blocks on %s"
                          % (len(user_info), len(score_list), comment_url))
                    break

                rows = []
                for (user_id, user_name), scores in zip(user_info, score_list):
                    rows.append(
                        "%12s%12s" % (hotel_id, user_id)
                        + setProperFormat(user_name)
                        + "%15s%15s%15s%15s%15s%15s\n" % tuple(scores[k] for k in range(6))
                    )
                # Written once per page so finished pages survive a later failure.
                writeToFile(store_file, "".join(rows))
                print("第%d页已存储，共%d页" % (n, pages))

            print("第%s家酒店的评论已存储" % num_hotel)


if __name__ == "__main__":
    getHotelInfo(hotel_file)
    getEveryComment(hotel_file)

阅读全文

0 0