Scraping Web Pages with Python Tools

I have recently been working on a text-analysis project that requires crawling relevant web pages for analysis. I used Python's requests and beautifulsoup packages to fetch and parse the pages. During the crawl I ran into quite a few problems that I had not anticipated before the work began. For example, the parsing process may differ from page to page, which can cause parsing to fail; and requesting the server's resources too frequently can produce a "connection closed by remote host" error. The code below takes both problems into account.


import requests
import bs4
import time

# output file name
output = open("C:\\result.csv", 'w', encoding="utf-8")

# start request
request_link = "http://where-you-want-to-crawl-from"
response = requests.get(request_link)

# parse the html
soup = bs4.BeautifulSoup(response.text, "html.parser")

# try to get the href of the 31st <a> tag; the index is specific to the target page
try:
    link = str(soup.find_all('a')[30].get('href'))
except Exception:
    link = 'NULL'

# found the related app
if link.startswith("/somewords"):
    # sleep so that the server is not hit too frequently
    time.sleep(2)
    # request the sub link
    response = requests.get("some_websites" + link)
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    # get the info you want: the <div> whose class is "o-content"
    info_you_want = str(soup.find("div", {"class": "o-content"}))
    # the breadcrumb div yields a secondary link; fall back to a marker on failure
    try:
        sub_link = str(soup.find("div", {"class": "crumb clearfix"})).split('</a>')[2].split('</div>')[0].strip()
    except Exception:
        sub_link = "NULL_because_exception"
    # cut the text out of the serialized div; fall back to a marker on failure
    try:
        info_you_want = info_you_want.split('"o-content">')[1].split('</div>')[0].strip()
    except Exception:
        info_you_want = "NULL_because_exception"
    info_you_want = info_you_want.replace('\n', '')
    info_you_want = info_you_want.replace('\r', '')
    # write results into file
    output.write(info_you_want + "\n\n")
# did not find the aimed link
else:
    # the original snippet referenced an app_name list and a loop index e from an
    # enclosing loop that is not shown here; write a placeholder row instead
    output.write(link + ",link_not_found\n")

output.close()
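Note that the snippet above extracts the text by string-splitting the serialized HTML on '"o-content">' and '</div>', which breaks as soon as a page nests another div inside the content block. Since BeautifulSoup has already parsed the page, letting it extract the text directly tends to tolerate page-to-page variation better. A minimal sketch of that alternative (the sample HTML here is made up for illustration):

from bs4 import BeautifulSoup

# made-up HTML standing in for a fetched page
html = '<div class="o-content"> Example\r\ntext <b>with markup</b> </div>'
soup = BeautifulSoup(html, "html.parser")

# find() returns None when the div is missing, so guard before extracting
div = soup.find("div", {"class": "o-content"})
if div is not None:
    # get_text() pulls the text out of the whole subtree, nested tags included
    info_you_want = div.get_text(" ", strip=True).replace('\r', '').replace('\n', '')
else:
    info_you_want = "NULL_because_exception"

print(info_you_want)  # Exampletext with markup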
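The fixed time.sleep(2) throttles the crawl, but a "connection closed by remote host" error can still occur on a transient hiccup, and the bare requests.get() would then abort the whole run. A common remedy is a small retry wrapper with exponential backoff; here is a minimal sketch of that idea (fetch_with_retry is a hypothetical helper, not part of the requests API):

import time
import requests

def fetch_with_retry(url, max_retries=3, base_delay=2.0):
    """GET a URL, retrying with exponential backoff on transient errors."""
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # turn HTTP 4xx/5xx into exceptions
            return response
        except (requests.ConnectionError, requests.Timeout):
            if attempt == max_retries - 1:
                raise  # give up after the last attempt
            time.sleep(base_delay * (2 ** attempt))  # wait 2s, 4s, 8s, ...

# usage: a drop-in replacement for requests.get() in the crawl above
# response = fetch_with_retry("http://where-you-want-to-crawl-from")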

