Python爬取58足浴上网站信息
来源:互联网 发布:linux设置ip网关地址 编辑:程序博客网 时间:2024/04/27 16:03
以58足浴列表页(http://bj.58.com/zuyu/pn1/?PGTID=0d306b61-0000-186a-d0e6-09e79d939b21&ClickID=1)为例,来实战练习一下XPath。想要爬取的信息包括:标题、类型、临近、转让费、租金、面积。
1. 使用基础库完成
先不使用框架,自己手写爬取该页的代码:
# -*- coding: utf-8 -*-
"""Scrape one 58.com foot-massage listing page and save it as a CSV file.

For every listing row the script collects six values: title, kind,
nearby location ("approach"), transfer fee, rent and area.
"""
import re

import pandas as pd

blank = ""  # empty string, used as join separator and regex replacement
colon_en = ":"  # ASCII colon
colon_zh = u"\uff1a"  # full-width (Chinese) colon
forward_slash = "/"  # separates the "kind" and "nearby" fragments
br_label = "<br>"  # literal line-break tag; kept only for backward compatibility
pattern_space = re.compile(r"\s+")  # any run of whitespace
pattern_line = re.compile(r"<br\s*?/?>")  # <br>, <br/> or <br />
pattern_label = re.compile(r"</?\w+[^>]*>")  # any HTML tag
pattern_colon = re.compile(u"[:\uff1a]")  # ASCII or full-width colon


def strip_markup(fragment):
    """Return *fragment* with every HTML tag and all whitespace removed."""
    return pattern_space.sub(blank, pattern_label.sub(blank, fragment))


def value_after_colon(fragment):
    """Return the text between the first and second colon of *fragment*.

    Both the ASCII and the full-width colon are recognised.  An empty string
    is returned when *fragment* contains no colon, instead of raising
    ``IndexError``.
    """
    parts = pattern_colon.split(fragment)
    return parts[1] if len(parts) > 1 else blank


def parse_kind_and_approach(text):
    """Extract ``(kind, approach)`` from the free text of the title cell.

    *text* is expected to hold "/"-separated "label:value" fragments.  A
    fragment whose label is missing leaves the matching value as "".
    """
    kind = blank
    approach = blank
    for fragment in pattern_space.sub(blank, text).split(forward_slash):
        if u"类型" in fragment:
            kind = value_after_colon(fragment)
        elif u"临近" in fragment:
            approach = value_after_colon(fragment)
    return kind, approach


def split_fee_and_rent(cell_html):
    """Extract ``(transfer_fee, rent)`` from the serialised price cell.

    The two values are separated by a line-break tag.  Any spelling of the
    tag is accepted, because the serialiser may emit ``<br/>`` rather than
    ``<br>``.  When the cell holds a single value (for example "negotiable"
    for both), that value is used for the fee and the rent alike.
    """
    parts = [strip_markup(part) for part in pattern_line.split(cell_html)]
    if len(parts) == 1:
        parts = parts * 2
    transfer_fee = pattern_colon.split(parts[0])[-1]
    rent = pattern_colon.split(parts[1])[-1]
    return transfer_fee, rent


def crawl_data(url):
    """Download *url* and return the listing columns found on the page.

    Args:
        url: Address of a 58.com listing page.

    Returns:
        A dict mapping "title", "kind", "approach", "trans_fee", "rent" and
        "area" to equally long lists with one entry per listing row.

    Raises:
        requests.RequestException: If the download fails, times out or the
            server answers with an HTTP error status.
    """
    # Imported here so that the text helpers and write_csv stay usable when
    # the scraping dependencies are not installed.
    import requests
    from lxml import etree

    data = {
        "title": [],
        "kind": [],
        "approach": [],
        "trans_fee": [],
        "rent": [],
        "area": [],
    }

    response = requests.get(url, timeout=30)
    response.raise_for_status()
    if not response.content:
        return data
    tree = etree.HTML(response.content)
    if tree is None:
        return data

    for row in tree.xpath("//*[@id='infolist']/table/tr"):
        cells = row.xpath("./td")
        # Skip rows without a price and an area cell before appending
        # anything, so that all columns keep the same length.
        if len(cells) < 4:
            continue

        raw_title = blank.join(row.xpath("./td[@class='t']/a/text()"))
        title = pattern_space.sub(blank, raw_title)

        kind, approach = parse_kind_and_approach(
            blank.join(row.xpath("./td[@class='t']/text()"))
        )

        # encoding="unicode" yields text rather than bytes, so the str
        # regular expressions above can be applied directly.
        transfer_fee, rent = split_fee_and_rent(
            etree.tostring(cells[2], encoding="unicode")
        )
        area = strip_markup(etree.tostring(cells[3], encoding="unicode"))

        data["title"].append(title)
        data["kind"].append(kind)
        data["approach"].append(approach)
        data["trans_fee"].append(transfer_fee)
        data["rent"].append(rent)
        data["area"].append(area)

        print("title: %s" % title)
        print("kind: %s, approach: %s" % (kind, approach))
        print("transfer_fee: %s, rent: %s" % (transfer_fee, rent))
        print("area: %s" % area)
        print("-" * 50)

    return data


def write_csv(data, file):
    """Write the column dict *data* to the CSV file *file*, GBK-encoded."""
    df = pd.DataFrame(data)
    df.to_csv(file, index=False, encoding="gbk")


if __name__ == "__main__":
    url = "http://bj.58.com/zuyu/pn1/?PGTID=0d306b61-0000-186a-d0e6-09e79d939b21&ClickID=1"
    data = crawl_data(url)
    out_file = "bj_58.csv"
    write_csv(data, out_file)
运行后结果:
2. 使用Scrapy框架完成
命令行中输入 scrapy startproject tutorial来创建一个tutorial工程。
在items.py中添加一个新的Item:
class ZuYuItem(scrapy.Item):
    """Container for the six values scraped from one listing row."""

    title = scrapy.Field()  # title
    kind = scrapy.Field()  # type
    approach = scrapy.Field()  # nearby location
    transfer_fee = scrapy.Field()  # transfer fee
    rent = scrapy.Field()  # rent
    area = scrapy.Field()  # area
在spiders目录下创建一个名为 bj_58.py 的新的Python文件。内容如下:
# -*- coding: utf-8 -*-
"""Scrapy spider for one 58.com foot-massage listing page."""
import re

import scrapy

from tutorial.items import ZuYuItem

# Compiled once at import time instead of on every parse() call.
_BLANK = ""
_FORWARD_SLASH = "/"
_PATTERN_SPACE = re.compile(r"\s+")  # any run of whitespace
_PATTERN_LINE = re.compile(r"<br\s*?/?>")  # <br>, <br/> or <br />
_PATTERN_LABEL = re.compile(r"</?\w+[^>]*>")  # any HTML tag
_PATTERN_COLON = re.compile(u"[:\uff1a]")  # ASCII or full-width colon


def _strip_markup(fragment):
    """Return *fragment* with every HTML tag and all whitespace removed."""
    return _PATTERN_SPACE.sub(_BLANK, _PATTERN_LABEL.sub(_BLANK, fragment))


def _value_after_colon(fragment):
    """Return the text between the first and second colon, or "" if none."""
    parts = _PATTERN_COLON.split(fragment)
    return parts[1] if len(parts) > 1 else _BLANK


def _split_fee_and_rent(cell_html):
    """Return ``(transfer_fee, rent)`` from the serialised price cell.

    A cell holding a single value supplies that value for both results.
    """
    parts = [_strip_markup(part) for part in _PATTERN_LINE.split(cell_html)]
    if len(parts) == 1:
        parts = parts * 2
    return (
        _PATTERN_COLON.split(parts[0])[-1],
        _PATTERN_COLON.split(parts[1])[-1],
    )


class BJ58Spider(scrapy.Spider):
    """Yield one ``ZuYuItem`` per listing row of the start page.

    Run with ``scrapy crawl bj_58 -o res.csv``.
    """

    name = "bj_58"
    start_urls = [
        "http://bj.58.com/zuyu/pn1/?PGTID=0d306b61-0000-186a-d0e6-09e79d939b21&ClickID=1"
    ]

    def parse(self, response):
        """Extract title, kind, nearby location, fee, rent and area per row.

        Rows that lack the price cell (third ``td``) or the area cell
        (fourth ``td``) are skipped.
        """
        for row in response.xpath("//*[@id='infolist']/table/tr"):
            fee_cell = row.xpath("./td[position()=3]").extract_first()
            area_cell = row.xpath("./td[position()=4]").extract_first()
            # extract_first() returns None when the cell is missing; such
            # rows cannot be parsed and would otherwise crash re.sub().
            if fee_cell is None or area_cell is None:
                continue

            # A fresh item per row: re-using one instance would make every
            # yielded item share (and overwrite) the same underlying data.
            item = ZuYuItem()

            raw_title = _BLANK.join(
                row.xpath("./td[@class='t']/a/text()").extract()
            )
            item["title"] = _PATTERN_SPACE.sub(_BLANK, raw_title)

            raw_kind_and_approach = _BLANK.join(
                row.xpath("./td[@class='t']/text()").extract()
            )
            kind = _BLANK
            approach = _BLANK
            fragments = _PATTERN_SPACE.sub(_BLANK, raw_kind_and_approach)
            for fragment in fragments.split(_FORWARD_SLASH):
                if u"类型" in fragment:
                    kind = _value_after_colon(fragment)
                elif u"临近" in fragment:
                    approach = _value_after_colon(fragment)
            item["kind"] = kind
            item["approach"] = approach

            transfer_fee, rent = _split_fee_and_rent(fee_cell)
            item["transfer_fee"] = transfer_fee
            item["rent"] = rent

            # Tags and whitespace are both removed so the value matches
            # the other, whitespace-free fields.
            item["area"] = _strip_markup(area_cell)

            self.log(
                "title: %s, transfer_fee: %s, rent: %s"
                % (item["title"], transfer_fee, rent)
            )
            yield item
在命令行中输入 scrapy crawl bj_58 -o res.csv,即可将结果存入 res.csv 文件中。
0 0
- Python爬取58足浴上网站信息
- Python爬取网站职位信息
- Python爬取网站信息出现Errno 10054
- Python 爬虫学习2爬取租房网站信息
- python爬取网页信息
- python 爬取淘宝信息
- Python爬取国家信息
- Python-爬取网页信息
- Python爬取天气信息
- python爬取网页信息
- python爬取网站图片
- Python爬取网站图片
- 使用python爬取学校门户网站相关信息并格式化输出
- python+beautifulsoup+smtp爬取学院网站的信息公告+邮件发送
- Python爬取信息的方法
- python爬取豆瓣电影信息
- python 登录并爬取淘宝信息
- Python爬取安居客新房信息
- c语言-----用循环语句实现金字塔,左对齐
- Mysql创建用户表并利用存储过程添加100万条随机用户数据
- {题解}[jzoj3864]【JSOI2014】歌剧表演
- AS导入友盟推送时报错 finished with non-zero exit value 2
- ubuntu 16.04 使用docker运行caffe环境
- Python爬取58足浴上网站信息
- 数据库阿里连接池 druid配置详解
- 51单片机f-0循环倒计时程序
- 17-01-17
- SQL server 2008 R2安装问题
- shell之(),(()),``的区别
- SteamVR按键的简单操作
- 机器学习算法之朴素贝叶斯算法
- 【算法分析与设计】各种排序算法的效率对比