Python爬虫XPATH

来源:互联网 发布:好喝的洋酒推荐 知乎 编辑:程序博客网 时间:2024/03/29 20:17
import urllib
import urllib.parse
import urllib.request

import lxml
from lxml import etree

# Running count of images saved so far; read and incremented by writeImage().
a = 0


def _fetch(url):
    """Download *url* and return the raw response body as bytes."""
    request = urllib.request.Request(url=url)
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(request) as response:
        return response.read()


def _image_filename(link):
    """Return the local file name to use for the image URL *link*.

    The name is the last segment of the URL path (query string excluded).
    When the path has no final segment, the last five characters of the
    URL are used instead.
    """
    name = urllib.parse.urlsplit(link).path.rsplit('/', 1)[-1]
    return name or link[-5:]


def loadPage(url):
    """Fetch one list page and process every thread link found on it.

    :param url: full URL of the list page to download.
    :return: None; each extracted link is printed and passed to loadImage().
    """
    content = etree.HTML(_fetch(url))
    # xpath() returns a list with every href that matched the expression.
    link_list = content.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
    for link in link_list:
        print(link)
        # The host is prepended, so the hrefs are presumably site-relative.
        fulllink = 'http://tieba.baidu.com' + link
        loadImage(fulllink)


def loadImage(link):
    """Fetch one thread page and save every image it embeds.

    :param link: full URL of the thread page.
    :return: None; each matching image URL is passed to writeImage().
    """
    print(link)
    content = etree.HTML(_fetch(link))
    link_list = content.xpath('//img[@class="BDE_Image"]/@src')
    print(link_list)
    for image_link in link_list:
        writeImage(image_link)


def writeImage(link):
    """Download a single image and write it to the current directory.

    :param link: URL of the image.
    :return: None; the module-level counter ``a`` is incremented.
    """
    # ``a`` is rebound below, so it must be declared global; without this
    # the read in str(a) raises UnboundLocalError.
    global a
    image = _fetch(link)
    filename = _image_filename(link)
    with open(filename, 'wb') as f:
        f.write(image)
    print('-' * 30)
    print('打印是照片' + str(a))
    a = a + 1


def tiebaSpider(url, beginPage, endpPage):
    """Crawl list pages ``beginPage``..``endpPage`` (inclusive).

    :param url: base list URL that already carries the ``kw`` query.
    :param beginPage: first page number to crawl (1-based).
    :param endpPage: last page number to crawl.
    :return: None
    """
    for page in range(beginPage, endpPage + 1):
        # The offset advances by 50 per page: page 1 -> 0, page 2 -> 50, ...
        pn = (page - 1) * 50
        fullurl = url + "&pn=" + str(pn)
        loadPage(fullurl)


if __name__ == "__main__":
    kw = input('请输入你要需要爬取的贴吧名:')
    beginPage = int(input('请输入起始页'))
    endPage = int(input('请输入结束页'))
    url = 'https://tieba.baidu.com/f?'
    # urlencode percent-encodes the forum name so it is safe in the query.
    key = urllib.parse.urlencode({'kw': kw})
    fullurl = url + key
    print(fullurl)
    tiebaSpider(fullurl, beginPage, endPage)