Scraping Xi'an Second-Hand Housing Listings from Sina's Second-Hand Housing Site with a Web Crawler


Having learned the basics of grabbing simple data with a crawler, and after working through 《python网络数据采集》 (Web Scraping with Python), I had a rough idea of how to scrape dynamically loaded pages in Python. Scraping Sina's second-hand housing site is my first exercise in collecting dynamically loaded pages with selenium.
The approach is as follows:
Start from the Xi'an second-hand housing home page. Hovering the mouse over a top-level district makes its sub-districts load; the sub-district URLs are then collected and stored in the URL manager (here simply a set). This step has to be done with selenium, because the sub-district links only appear after the hover.
     

# Get the links of the sub-districts (requires selenium: they are only
# rendered after hovering over the top-level district)
def get_urls(baseurl, areas):
    urls = set()
    browser = webdriver.Firefox()
    browser.get(baseurl)
    for area in areas:
        # Locate the top-level district link and hover over it so the
        # sub-district menu is loaded into the page
        right = browser.find_element_by_xpath(
            "//a[@data-param='" + str(area['data-param']) + "']")
        ActionChains(browser).move_to_element(right).perform()
        html1 = browser.page_source
        s_soup = BeautifulSoup(html1, 'html.parser')
        s_area = s_soup.find_all('a', href=re.compile(r'b\d{1,2}-' + area['data-param']))
        for s in s_area:
            url = s['href']
            urls.add(url)
    browser.quit()
    return urls
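One thing to watch with this approach: page_source is read immediately after the hover, so if the sub-district menu has not finished rendering, the links are missed. Below is a minimal sketch of adding an explicit wait; the hover_and_wait helper and the XPath it waits on are my own assumptions (based on the href pattern used in the regex above), not part of the original script.

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def hover_and_wait(browser, area_code, timeout=10):
    # Hover over the top-level district link so its sub-districts load
    link = browser.find_element_by_xpath("//a[@data-param='" + area_code + "']")
    ActionChains(browser).move_to_element(link).perform()
    # Block until at least one link whose href contains the area code has been
    # rendered (assumed pattern, e.g. 'b5-a3' for area code 'a3')
    WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located(
            (By.XPATH, "//a[contains(@href, '-" + area_code + "')]")))
    return browser.page_source

get_urls could then parse the HTML returned by this helper instead of reading page_source directly after the hover.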

    
Once all the sub-district URLs have been collected and stored, each URL is parsed in turn and the listing data on every result page is extracted.
Finally, the collected data is written to an Excel file.
The full implementation:
      

#coding=UTF-8
import urllib2
import re
import xlwt
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains


# Download a page
def download(url):
    if url is None:
        return None
    try:
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0')
        response = urllib2.urlopen(request)
    except urllib2.URLError, e:
        if hasattr(e, 'code'):
            print e.code
        if hasattr(e, 'reason'):
            print e.reason
        return None
    if response.getcode() != 200:
        return None
    return response.read()


# Collect the data-param codes of the top-level districts
def area_data_code(soup):
    area_codes = set()
    areas = soup.find_all('a', {"data-param": re.compile(r"a\d{1,2}$")})
    if areas is None:
        return
    for area in areas:
        area_code = area['data-param']
        area_codes.add(area_code)
    print len(area_codes)
    return area_codes


# Parse the start page and return the top-level district links
def parser(html):
    if html is None:
        return
    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
    areas = soup.find_all('a', {"data-param": re.compile(r"a\d{1,2}$")})
    if areas is None:
        return
    return areas


# Get the links of the sub-districts (requires selenium: they are only
# rendered after hovering over the top-level district)
def get_urls(baseurl, areas):
    urls = set()
    browser = webdriver.Firefox()
    browser.get(baseurl)
    for area in areas:
        right = browser.find_element_by_xpath(
            "//a[@data-param='" + str(area['data-param']) + "']")
        ActionChains(browser).move_to_element(right).perform()
        html1 = browser.page_source
        s_soup = BeautifulSoup(html1, 'html.parser')
        s_area = s_soup.find_all('a', href=re.compile(r'b\d{1,2}-' + area['data-param']))
        for s in s_area:
            url = s['href']
            urls.add(url)
    browser.quit()
    return urls


# Determine how many result pages a sub-district URL has and build one URL per page
def new_urls(page_url):
    new_urls = set()
    page_html = download(page_url)
    page_soup = BeautifulSoup(page_html, 'html.parser')
    if page_soup.find('div', class_='sorry'):
        return None
    # Get the maximum page number
    else:
        try:
            pn_text = page_soup.find('span', class_='no-b').get_text()
            max_pn = int(pn_text[1])
            print max_pn
        except:
            return None
        for i in range(1, max_pn + 1):
            # Insert the page number into the URL
            new_url = re.sub(r'-', '-n' + str(i) + '-', page_url)
            new_urls.add(new_url)
        return new_urls


# Collect the listing data from one result page
def get_page_data(new_page_url):
    datas = []
    new_page_html = download(new_page_url)
    new_page_soup = BeautifulSoup(new_page_html, 'html.parser')
    title_list = new_page_soup.find_all('a', {'link-clk': '1'}, class_='txt-cut')
    plot_list = new_page_soup.find_all('a', class_='mr20 txt-cut cmm-name')
    house_info_list = new_page_soup.find_all('span', class_='mr20')
    address_list = new_page_soup.find_all('a', href=re.compile(r'http://sx.esf.sina.com.cn/house/'), target='_blank')
    price_list = new_page_soup.find_all('span', class_='georgia')
    single_price_list = new_page_soup.find_all('div', class_='two')
    for i in range(len(title_list)):
        data = {}
        data['title'] = title_list[i].string
        data['plot_name'] = plot_list[i].string
        data['house_type'] = house_info_list[i * 3].string
        data['area'] = house_info_list[i * 3 + 1].string
        data['orientation'] = house_info_list[i * 3 + 2].string
        data['m_address'] = address_list[i * 2].string
        data['s_address'] = address_list[i * 2 + 1].string
        price_sub = re.sub(r'\n', '', price_list[i].string)
        price_sub = re.sub(r' +', '', price_sub)
        data['price'] = price_sub
        data['single_price'] = single_price_list[i].string
        datas.append(data)
    return datas


# Write the collected data to an Excel file
def output(datas):
    work = xlwt.Workbook(encoding='utf-8')
    sheet = work.add_sheet(u'sheet')
    sheet.write(0, 0, '标题')
    sheet.write(0, 1, '小区名')
    sheet.write(0, 2, '户型')
    sheet.write(0, 3, '面积')
    sheet.write(0, 4, '朝向')
    sheet.write(0, 5, '大区域')
    sheet.write(0, 6, '小区域')
    sheet.write(0, 7, '价格')
    sheet.write(0, 8, '单价')
    n = 1
    for data in datas:
        sheet.write(n, 0, data['title'].encode('utf8'))
        sheet.write(n, 1, data['plot_name'].encode('utf8'))
        sheet.write(n, 2, data['house_type'].encode('utf8'))
        sheet.write(n, 3, data['area'].encode('utf8'))
        sheet.write(n, 4, data['orientation'].encode('utf8'))
        sheet.write(n, 5, data['m_address'].encode('utf8'))
        sheet.write(n, 6, data['s_address'].encode('utf8'))
        sheet.write(n, 7, data['price'])
        sheet.write(n, 8, data['single_price'].encode('utf8'))
        n = n + 1
    work.save(u'secondhouse.xls'.encode('utf8'))


if __name__ == '__main__':
    baseurl = 'http://sx.esf.sina.com.cn/house/'
    html = download(baseurl)
    areas = parser(html)
    urls = get_urls(baseurl, areas)
    house_datas = []
    while len(urls) != 0:
        page_url = urls.pop()
        print page_url
        new_page_urls = new_urls(page_url)
        if new_page_urls is None:
            continue
        while len(new_page_urls) != 0:
            new_page_url = new_page_urls.pop()
            print new_page_url
            page_datas = get_page_data(new_page_url)
            for a in page_datas:
                print a
            house_datas = house_datas + page_datas
    output(house_datas)
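As a quick sanity check of the result file (not part of the original script), the spreadsheet can be read back with xlrd. This is just a sketch assuming the secondhouse.xls file written by output() above:

# Read the generated spreadsheet back and print the row count and the
# first data row. Assumes secondhouse.xls exists in the working directory.
import xlrd

book = xlrd.open_workbook('secondhouse.xls')
sheet = book.sheet_by_index(0)
print sheet.nrows - 1       # number of collected listings (minus header row)
print sheet.row_values(1)   # first listing as a list of cell values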

The collected data looks like this:
[Screenshot: a sample of the scraped listings]
In the end about 1,000 second-hand housing listings were collected, which is a fairly small data set; Soufun (搜房网) lists 4,000+ second-hand houses in Xi'an, but reading its pages with urlopen produced garbled text, an encoding problem I have not solved yet. A reader suggested that the requests module avoids such page-encoding issues, which is worth trying. The data has not been visualized yet. Should I go learn some data analysis... it looks like fun, but I'm just a software test engineer, sigh~
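For reference, here is a minimal sketch of the requests-based download suggested above (untested against Soufun; the URL shown is just the Sina start page already used in the script). requests guesses the response encoding, and when that guess is wrong, the charset detected from the page body (apparent_encoding) can be used before reading .text:

# Download a page with requests and let the body-detected charset drive decoding
import requests

def download_with_requests(url):
    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    if r.status_code != 200:
        return None
    r.encoding = r.apparent_encoding   # fall back to the charset detected from the body
    return r.text

html = download_with_requests('http://sx.esf.sina.com.cn/house/')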
