爬取简书标题和摘要

来源:互联网 发布:react用js和jsx的区别 编辑:程序博客网 时间:2024/06/15 12:11

有些小瑕疵:不知道为什么,爬取到的标题个数和摘要个数都不是整个页面的数量,而且两者也不相等。不过这是第一次做出来,简单记录一下啦~

import os
import re
import urllib.request


def url_open(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    A browser-like User-Agent header is attached to the request.

    Args:
        url: Address of the page to download.

    Returns:
        The page content as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36')
    # Open the Request object (not the bare URL) so the header above is
    # actually sent; the context manager closes the connection afterwards.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')


def _extract_entries(html):
    """Return a list of (title, abstract) pairs found in *html*."""
    title_pattern = r'<a.*?class="title".*?target="_blank".*?href=".*?">(.*?)</a>'
    abstract_pattern = r'<p.*?class="abstract">(.*?)</p>'
    titles = re.findall(title_pattern, html, re.S)
    abstracts = re.findall(abstract_pattern, html, re.S)
    # Titles and abstracts are matched independently and paired by position.
    # zip() stops at the shorter list, so a count mismatch cannot raise
    # IndexError.
    # NOTE(review): if an article has no abstract, later pairs are presumably
    # misaligned -- confirm against the page markup.
    return list(zip(titles, abstracts))


def get_title(html):
    """Extract titles and abstracts from *html* and write them to essay.txt.

    The file is created in the current working directory (overwriting any
    existing one) and encoded as UTF-8. Each entry is written as the
    1-based index immediately followed by the title on one line, then the
    abstract on the next line.

    Args:
        html: HTML source of the page to parse.
    """
    entries = _extract_entries(html)
    # Explicit encoding so the output does not depend on the platform locale.
    with open(r'essay.txt', 'w', encoding='utf-8') as f:
        for number, (title, abstract) in enumerate(entries, start=1):
            f.write(str(number) + title + '\n' + abstract + '\n')


if __name__ == '__main__':
    # Raw string: the backslash is a path separator, not an escape sequence.
    output_dir = r"E:\Title"
    # exist_ok lets the script be re-run without failing on the directory.
    os.makedirs(output_dir, exist_ok=True)
    os.chdir(output_dir)
    url = 'http://www.jianshu.com/'
    get_title(url_open(url))