[Python Web Crawler] Reading Notes on Web Scraping with Python (Chapter 3)

Web Scraping with Python

Chapter 3: Starting to Crawl

demo1

Traversing a single domain

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bsobj = BeautifulSoup(html, "html.parser")   # name the parser explicitly to avoid the BeautifulSoup warning

# Article links live inside the bodyContent div and match /wiki/... paths with no ":" in them
for link in bsobj.find('div', {'id': 'bodyContent'}).findAll('a', href=re.compile("^(/wiki/)((?!:).)*$")):
    if 'href' in link.attrs:
        print(link.attrs['href'])

print(type(bsobj.find('div', {'id': 'bodyContent'})))   # <class 'bs4.element.Tag'>
print(type(bsobj.findAll('a', href=re.compile("^(/wiki/)((?!:).)*$"))))   # <class 'bs4.element.ResultSet'>
print(type(bsobj.find('div', {'id': 'bodyContent'}).findAll('a', href=re.compile("^(/wiki/)((?!:).)*$"))))   # <class 'bs4.element.ResultSet'>
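
As a quick sanity check of the href filter used above (my own addition, not from the book): plain article paths under /wiki/ match, while namespace pages whose path contains a colon, such as /wiki/Category:..., are rejected.

import re

article_pattern = re.compile("^(/wiki/)((?!:).)*$")   # same pattern as in the demo

samples = [
    "/wiki/Kevin_Bacon",               # plain article page -> matches
    "/wiki/Category:1958_births",      # namespace page with ":" -> rejected
    "/wiki/File:Kevin_Bacon.jpg",      # file page -> rejected
    "/w/index.php?title=Kevin_Bacon",  # not under /wiki/ -> rejected
]
for path in samples:
    print(path, "->", bool(article_pattern.match(path)))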

demo2

Using a getLinks function to fetch URLs

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

def getLinks(articleUrl):
    # Follow the article path passed in (the original hard-coded the Kevin Bacon URL here,
    # which made the random walk fetch the same page forever)
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bsobj = BeautifulSoup(html, "html.parser")
    return bsobj.find('div', {'id': 'bodyContent'}).findAll('a', href=re.compile("^(/wiki/)((?!:).)*$"))

if __name__ == '__main__':
    random.seed(datetime.datetime.now())
    links = getLinks("/wiki/Kevin_Bacon")
    while len(links) > 0:
        # Pick a random article link and keep walking from there
        newArticle = links[random.randint(0, len(links) - 1)].attrs['href']
        print(newArticle)
        links = getLinks(newArticle)
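
A small variation I find handy (my own sketch, not from the book): cap the random walk at a fixed number of hops so it always terminates, and pause briefly between requests. It reuses the getLinks function defined above.

import time

def randomWalk(startUrl, max_hops=10):
    links = getLinks(startUrl)   # getLinks as defined in the demo above
    hops = 0
    while len(links) > 0 and hops < max_hops:
        newArticle = links[random.randint(0, len(links) - 1)].attrs['href']
        print(newArticle)
        time.sleep(1)            # be polite: throttle requests
        links = getLinks(newArticle)
        hops += 1

# randomWalk("/wiki/Kevin_Bacon")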

demo3

Save the collected URLs to a database (create the database yourself beforehand; a setup sketch follows the demo code below).

import mysql.connector  # import the MySQL driver
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

def insertdatabase(url):
    # connection settings
    config = {
        'host'     : '127.0.0.1',
        'user'     : 'root',
        'password' : '1234',
        'port'     : '3306',
        'database' : 'url_save',
        'charset'  : 'utf8'   # "utf8", without the hyphen
    }
    database = mysql.connector.connect(**config)
    cur = database.cursor()   # get a cursor from the connection
    cur.execute("create table if not EXISTS url_table_" + url[2:5]
                + "(ID int PRIMARY KEY auto_increment,URL VARCHAR(200));")
    # without the single quotes around %s in VALUES the URL cannot be written to the database
    cur.execute("insert into url_table_" + url[2:5] + "(URL) VALUES('%s')" % url)
    print('Inserted successfully')
    database.commit()   # don't forget to commit
    cur.close()
    database.close()

def getLinks(articleUrl):
    # Follow the article path passed in, as in demo2
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bsobj = BeautifulSoup(html, "html.parser")
    return bsobj.find('div', {'id': 'bodyContent'}).findAll('a', href=re.compile("^(/wiki/)((?!:).)*$"))

if __name__ == '__main__':
    random.seed(datetime.datetime.now())
    links = getLinks("/wiki/Kevin_Bacon")
    while len(links) > 0:
        newArticle = links[random.randint(0, len(links) - 1)].attrs['href']
        print(newArticle)
        insertdatabase(newArticle)
        links = getLinks(newArticle)
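
The demo assumes the url_save database already exists. A minimal one-off setup sketch (my own, using the same assumed credentials as the config dict above):

import mysql.connector

# Create the database the demo expects; host/user/password mirror the config above.
conn = mysql.connector.connect(host='127.0.0.1', user='root', password='1234', port=3306)
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS url_save DEFAULT CHARACTER SET utf8;")
conn.commit()
cur.close()
conn.close()

Using a parameterized query (passing the URL as a separate argument to cur.execute instead of formatting it into the SQL string) would also sidestep the quoting issue noted in the comment above.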

demo4

De-duplicating links

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org' + pageUrl)
    bsobj = BeautifulSoup(html, "html.parser")
    try:
        print(bsobj.h1.get_text())
        print(bsobj.find(id='mw-content-text').findAll('p')[0].get_text())
        print(bsobj.find(id='ca-edit').find("span").find('a').attrs['href'])
    except AttributeError:
        print("This page is missing some attributes")
    for link in bsobj.findAll('a', href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # a page we have not seen yet
                newPage = link.attrs['href']
                print('**************************************\n' + newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks("")
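
One caveat with the recursive version (my own note, not from the book): Python's default recursion limit is roughly 1000 calls, so a deep crawl will eventually raise RecursionError. An iterative rewrite with an explicit queue avoids that; a minimal sketch along the same lines:

from collections import deque
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

def crawl(startUrl=""):
    pages = set()
    queue = deque([startUrl])          # pages waiting to be visited
    while queue:
        pageUrl = queue.popleft()
        html = urlopen('http://en.wikipedia.org' + pageUrl)
        bsobj = BeautifulSoup(html, "html.parser")
        for link in bsobj.findAll('a', href=re.compile("^(/wiki/)")):
            href = link.attrs.get('href')
            if href and href not in pages:
                print(href)
                pages.add(href)
                queue.append(href)     # schedule instead of recursing

# crawl()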

demo5

A combination of the previous pieces that can selectively collect internal or external links; after running for a short while, though, it started returning 403 errors (see the workaround sketched after the code below).

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
allExtLinks = set()
allIntLinks = set()
random.seed(datetime.datetime.now())

# get a list of all internal links found on a page
def getInternalLinks(bsobj, includeUrl):
    internalLinks = []
    # find all links that begin with "/" or contain the current URL
    for link in bsobj.findAll('a', href=re.compile("^(/|.*" + includeUrl + ")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
    return internalLinks

# get a list of all external links found on a page
def getExternalLinks(bsobj, excludeUrl):
    externalLinks = []
    # find all links starting with "http" or "www" that do not contain the current URL
    for link in bsobj.findAll('a', href=re.compile("^(http|www)((?!" + excludeUrl + ").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def splitAddress(address):
    addressParts = address.replace("http://", "").split("/")
    return addressParts

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsobj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsobj, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        # no external links here, so dive into a random internal link and try again
        internalLinks = getInternalLinks(bsobj, startingPage)
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks) - 1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link is: " + externalLink + "\n")
    followExternalOnly(externalLink)

followExternalOnly("http://oreilly.com")
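
About those 403 errors: many servers reject the default Python-urllib user agent. A common workaround (my own sketch, not part of the book's code) is to send a browser-like User-Agent header via urllib.request.Request and use that in place of the bare urlopen(startingPage) call:

from urllib.request import urlopen, Request

def openPage(url):
    # Some servers return 403 for the default "Python-urllib/3.x" user agent,
    # so present a browser-like one instead. The header value here is just an example.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    return urlopen(req)

# In getRandomExternalLink above:  html = openPage(startingPage)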

The later material on Scrapy only supports Python 2.7, so I'm setting it aside for now.
