A Simple Focused Crawler

The script below crawls ithome.com for 500+ news articles and extracts each article's title, author, category, responsible editor, and publication time using requests and BeautifulSoup.

import re

import requests
from bs4 import BeautifulSoup

# Focused crawler: collect 500+ news articles from ithome.com.
#
# Relevant excerpt from the site's robots.txt:
# User-Agent: Baiduspider
# Disallow: /tags/
# Disallow: /tag/
# Disallow: /ithome/
# Disallow: /keywords/
# Disallow: /search/
# Disallow: /tag/adt_all*
# Disallow: /comment/
# Disallow: /*?*
# Disallow: /?*
# Disallow: /html/zixun/
# User-Agent: *
# Disallow: /ithome/
# Disallow: /keywords/
# Disallow: /search/
# Disallow: /comment/
# Disallow: /*?*
# Disallow: /?*
# Disallow: /html/zixun/


# A record holding one article's metadata.
class Essay:
    __essayName = ""      # article title
    __essayAuthor = ""    # article author
    __sort = ""           # article category
    __editor = ""         # responsible editor
    __releaseTime = ""    # publication time

    def setEssayName(self, name):
        self.__essayName = name

    def getEssayName(self):
        return self.__essayName

    def setEssayAuthor(self, author):
        self.__essayAuthor = author

    def getEssayAuthor(self):
        return self.__essayAuthor

    def setSort(self, sort):
        self.__sort = sort

    def getSort(self):
        return self.__sort

    def setEditor(self, editor):
        self.__editor = editor

    def getEditor(self):
        return self.__editor

    def setReleaseTime(self, releaseTime):
        self.__releaseTime = releaseTime

    def getReleaseTime(self):
        return self.__releaseTime


# Fetch the page at `url`; return its text, or "error" on any failure.
def getHtmlText(url):
    try:
        kv = {"user-agent": "Chrome/10.0"}
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
        r.encoding = "utf-8"  # inspecting the site shows its pages are UTF-8
        return r.text
    except Exception:
        return "error"


# Collect every href on the page that matches the compiled pattern `p`.
def getCurrentPageUrl(text, p):
    urls = set()
    soup = BeautifulSoup(text, "html.parser")
    for a in soup.find_all(href=p):
        urls.add(a.get("href"))
    return urls


# Extract one article page's metadata into an Essay.
def getCurrentPageData(text):
    soup = BeautifulSoup(text, "html.parser")
    obj = Essay()
    # Publication time
    time = soup.find(id="pubtime_baidu")
    if time is not None:
        obj.setReleaseTime(time.string)
    # Article title
    name = soup.find(attrs={"class": "post_title"})
    if name is not None:
        obj.setEssayName(name.find("h1").string)
    # Article author
    author = soup.find(id="author_baidu")
    if author is not None:
        obj.setEssayAuthor(author.find("strong").string)
    # Article category: the last link in the breadcrumb navigation
    sort = soup.find("div", attrs={"class": "current_nav"})
    if sort is not None:
        nav_a = sort.find_all("a")
        obj.setSort(nav_a[-1].string)
    # Responsible editor
    editor = soup.find(id="editor_baidu")
    if editor is not None:
        obj.setEditor(editor.find("strong").string)
    return obj


def printData(obj):
    print("Name:%10s Author:%10s Sort:%10s Editor:%10s Time:%10s"
          % (obj.getEssayName(), obj.getEssayAuthor(), obj.getSort(),
             obj.getEditor(), obj.getReleaseTime()))


def main():
    dataList = []
    # Article URLs look like http://www.ithome.com/html/<category>/<6 digits>.htm;
    # the (?!zixun) lookahead skips /html/zixun/, which robots.txt disallows.
    p = re.compile(r"http://www.ithome.com/html/((?!zixun).)*/\d{6}\.htm")
    s = getCurrentPageUrl(getHtmlText("http://www.ithome.com/"), p)
    urlList = list(s)
    # If the front page alone yields fewer than 500 links, follow each
    # collected article page and harvest the matching links it contains.
    if len(s) < 500:
        for i in range(len(s)):
            res = getCurrentPageUrl(getHtmlText(urlList[i]), p)
            urlList += [u for u in res if u not in urlList]
            if len(urlList) > 500:
                break
    for url in urlList:
        obj = getCurrentPageData(getHtmlText(url))
        printData(obj)
        dataList.append(obj)


main()
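
A note on the URL filter: the pattern relies on a negative lookahead, ((?!zixun).)*, which consumes characters only while the text ahead does not begin with "zixun". That is what keeps the disallowed /html/zixun/ section out of the crawl. Below is a minimal sketch of the behavior on a few made-up sample URLs; since BeautifulSoup's find_all(href=p) filters each href with p.search(), search() is used here too.

import re

# Same pattern as in main(): ithome article URLs, excluding the
# /html/zixun/ section that robots.txt disallows.
p = re.compile(r"http://www.ithome.com/html/((?!zixun).)*/\d{6}\.htm")

# Hypothetical sample URLs, for illustration only.
samples = [
    "http://www.ithome.com/html/it/123456.htm",     # ordinary article
    "http://www.ithome.com/html/zixun/123456.htm",  # disallowed section
    "http://www.ithome.com/html/it/12345.htm",      # wrong digit count
]
for url in samples:
    print(url, "->", bool(p.search(url)))
# Expected output: True, False, False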
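
Rather than hand-copying robots.txt into comments, the rules can also be checked programmatically with the standard library's urllib.robotparser. Here is a minimal sketch that parses a subset of the rules quoted above; Python's robotparser does plain prefix matching and does not expand the * wildcard lines, so those are left out (calling rp.set_url(...) and rp.read() would fetch the live file instead, whose rules may have changed since this post).

from urllib import robotparser

# A subset of the robots.txt rules quoted in the comments above.
rules = """\
User-Agent: *
Disallow: /ithome/
Disallow: /keywords/
Disallow: /search/
Disallow: /comment/
Disallow: /html/zixun/
"""

rp = robotparser.RobotFileParser()
rp.parse(rules.splitlines())

# can_fetch(useragent, url) -> True if the URL may be crawled.
print(rp.can_fetch("*", "http://www.ithome.com/html/it/123456.htm"))     # True
print(rp.can_fetch("*", "http://www.ithome.com/html/zixun/123456.htm"))  # False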