Web Scraping with Python, Learning Example: Crawling Across the Internet

This program starts at http://oreilly.com and then randomly hops from one external link to the next.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
# Seed with the current time so each run takes a different path.
# (random.seed() accepts only int/float/str/bytes in Python 3.11+,
# so pass a timestamp rather than a datetime object.)
random.seed(datetime.datetime.now().timestamp())

# Retrieve a list of all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    # Find all links that begin with "/" or contain the current URL
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
    return internalLinks

# Retrieve a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # Find all links that start with "http" or "www" and do not
    # contain the current URL
    for link in bsObj.findAll("a",
                    href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def splitAddress(address):
    # Strip the scheme and split on "/"; the first element is the
    # domain (e.g. "oreilly.com")
    addressParts = address.replace("http://", "").split("/")
    return addressParts

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        # No external links on this page: follow a random internal
        # link and look for an external link there instead
        internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link is: " + externalLink)
    followExternalOnly(externalLink)

followExternalOnly("http://oreilly.com")
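
As written, the example has two practical weak points: any dead or scheme-less link (the regex also admits hrefs starting with "www") will end the crawl with an exception, and the unbounded recursion in followExternalOnly will eventually hit Python's default recursion limit of about 1000 frames on a long walk. Below is a minimal hardening sketch that reuses getRandomExternalLink from above; the name followExternalSafely and the maxHops cap are illustrative additions, not part of the original example.

from urllib.error import HTTPError, URLError

# Illustrative variant: a bounded loop instead of unbounded recursion,
# stopping gracefully when a page cannot be fetched.
def followExternalSafely(startingSite, maxHops=10):
    site = startingSite
    for _ in range(maxHops):
        try:
            link = getRandomExternalLink(site)
        except (HTTPError, URLError, ValueError):
            # HTTPError/URLError: dead or unreachable page; ValueError:
            # a relative href reached urlopen inside the recursive
            # fallback. Either way, end the walk here.
            print("Could not continue from " + site + ", stopping.")
            return
        if link.startswith("www"):
            link = "http://" + link  # add the scheme the regex allows to be missing
        print("Random external link is: " + link)
        site = link

followExternalSafely("http://oreilly.com")

Besides surviving bad links, the bounded loop keeps memory flat: each hop reuses the same stack frame instead of nesting a new recursive call per page visited.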