myFirstCrawler


Crawls Xiachufang's recipe pages, with built-in pagination~~

It can also save the scraped content into a file~

Without further ado, here's the code:

# Get the information (recipes and the chef's name included) of some delicious dishes
# and save the information in a file.
from lxml import html
from time import sleep
import os

ls = os.linesep
filename = "OutputFile.txt"
if os.path.exists(filename):
    raise SystemExit("ERROR: '%s' already exists! Please name the file again~" % filename)

writeline = "get the information of some delicious dishes" + ls

# Parse the "explore" page and pull out titles, authors, cook counts and ingredients.
x = html.parse('http://www.xiachufang.com/explore')
titles = x.xpath("//ul[@class='list']/li/div/div/p[@class='name']/a/text()")
cook = x.xpath("//ul[@class='list']/li/div/div/p[@class='author']/a/text()")
status = x.xpath("//ul[@class='list']/li/div/div/p[@class='stats green-font']/span/text()")
material = x.xpath("//ul[@class='list']/li/div/div/p[@class='ing ellipsis']/text()")

writeline += "We got %s titles with their chef names and status. Here are the top 5:" % len(titles) + ls

# Record the first five recipes.
for i, title in enumerate(titles[:5]):
    writeline += "   >" + title + ls
    writeline += "   >>chef:" + cook[i] + ls
    writeline += "   >>>has been cooked:" + status[i] + " times" + ls
    writeline += "   >>>>material:" + material[i] + ls
    writeline += "**********************************************************" + ls

# Function: next page.
# Assume that 50 titles are enough.
writeline += ls + ls + "*********************function: searching next pages***********************" + ls
next_button_xpath = "//a[@class='next']/@href"
headline_xpath = "//ul[@class='list']/li/div/div/p[@class='name']/a/text()"
newTitles = []
base_url = 'http://www.xiachufang.com{}'   # the "next" href already starts with a slash
next_page = 'http://www.xiachufang.com/explore'
threshold = 50

# Follow the "next" button until we have collected enough titles or run out of pages.
while len(newTitles) < threshold and next_page:
    x = html.parse(next_page)
    headlines = x.xpath(headline_xpath)
    writeline += "Retrieved {} titles from url: {}".format(len(headlines), next_page) + ls
    newTitles += headlines
    next_pages = x.xpath(next_button_xpath)
    if next_pages:
        next_page = base_url.format(next_pages[0])
    else:
        writeline += "No next button found" + ls
        next_page = None
    sleep(3)   # be polite to the server

if len(newTitles) >= threshold:
    writeline += "the number of titles is:%s, enough information!" % len(newTitles) + ls

# Save everything to the output file.
with open(filename, 'wb') as out:
    out.write(writeline.encode('utf-8'))

print('Done! Tada!!')
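The pagination trick above boils down to "read the next-button href, build the next URL, repeat". If you want to reuse it elsewhere, the same idea can be wrapped in a small generator. The sketch below is my own refactor rather than part of the original script: the function name iter_pages and the max_pages/delay parameters are made-up, while the XPath selectors and URLs are the same ones used above.

# A reusable pagination helper (hypothetical refactor, not in the original script).
from lxml import html
from time import sleep

NEXT_XPATH = "//a[@class='next']/@href"

def iter_pages(start_url, base_url='http://www.xiachufang.com{}', max_pages=10, delay=3):
    """Yield one parsed lxml tree per page, following the 'next' link."""
    url = start_url
    for _ in range(max_pages):
        tree = html.parse(url)
        yield tree
        hrefs = tree.xpath(NEXT_XPATH)
        if not hrefs:
            break              # last page: no next button
        url = base_url.format(hrefs[0])
        sleep(delay)           # be polite between requests

# Usage: collect recipe titles until we have 50 of them.
titles = []
for tree in iter_pages('http://www.xiachufang.com/explore'):
    titles += tree.xpath("//ul[@class='list']/li/div/div/p[@class='name']/a/text()")
    if len(titles) >= 50:
        break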
