Scraping Every Phone Number on Haian Net (www.haianw.com) with Python


Notes: 1. This program uses a Microsoft SQL Server database. Before running it, edit the database connection settings at the top of the script.

2. The pyodbc library is required.
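
The script below inserts into a table named Numbers with link and number columns, but it never creates that table. Here is a minimal one-time setup sketch; the column names match the script's insert statement, while the NVARCHAR sizes are assumptions, so widen them and adjust the connection string to fit your own server.

import pyodbc

conn = pyodbc.connect(r'DRIVER={SQL Server};SERVER=WXS-PC;DATABASE=Test;UID=sa')
cursor = conn.cursor()
# link holds the page URL, number holds the extracted phone number;
# the column sizes below are guesses, not requirements of the script
cursor.execute("CREATE TABLE Numbers (link NVARCHAR(500), number NVARCHAR(20))")
conn.commit()
conn.close()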

import re
import sys

import requests
from bs4 import BeautifulSoup
import pyodbc

pages = set()
conn = pyodbc.connect(r'DRIVER={SQL Server};SERVER=WXS-PC;DATABASE=Test;UID=sa')
cursor = conn.cursor()

# Recursively crawl the whole site, recording visited links in a set
def getLinks(pageUrl):
    global pages
    r = requests.get(pageUrl, timeout=30)
    demo = r.text
    bsObj = BeautifulSoup(demo, 'html.parser')
    # Skip external links: only follow anchors that point at www.haianw.com
    for link in bsObj.findAll('a', href=re.compile(r"www\.haianw\.com")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                findPhone(newPage)
                pages.add(newPage)
                getLinks(newPage)

# Collect the phone numbers found on the current URL into a dict
def findPhone(url):
    numbers = {}
    r = requests.get(url, timeout=30)
    data = r.text
    # Regex for landlines (3- or 4-digit area code) and mobile numbers
    phone_list = re.findall(r"\d{3}-\d{8}|\d{4}-\d{7}|1[34578]\d{9}", data)
    phone_list = list(set(phone_list))
    for phone in phone_list:
        numbers[phone] = url
    writePhone(numbers)

def writePhone(numbers):
    global cursor
    global conn
    for k, v in numbers.items():
        # Parameterized insert: avoids SQL injection and quoting errors
        cursor.execute("insert into Numbers (link, number) values (?, ?)", v, k)
        conn.commit()

if __name__ == '__main__':
    # Raise the recursion limit to one million so deep crawls
    # do not abort with RecursionError
    sys.setrecursionlimit(1000000)
    print("Crawling every link on the site...")
    try:
        getLinks('http://www.haianw.com')
    except Exception:
        print('The crawler hit an error and has stopped...')
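
As a quick sanity check, the phone-number pattern above matches landlines with a 3- or 4-digit area code and 11-digit mobile numbers beginning with 13/14/15/17/18. The snippet below runs it against made-up sample text; note that mobiles beginning 16 or 19 would be missed, so extend the character class if you need them:

import re

sample = "Hotline 0513-8888888, mobile 13912345678, zip code 226600"
print(re.findall(r"\d{3}-\d{8}|\d{4}-\d{7}|1[34578]\d{9}", sample))
# ['0513-8888888', '13912345678']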

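One design note: the script leans on recursion plus sys.setrecursionlimit(1000000) to survive deep link chains. A common alternative, sketched below as an assumption rather than the author's method, keeps an explicit to-visit list so no recursion-limit tweak is needed; findPhone would be called at the point where each page is fetched.

import re

import requests
from bs4 import BeautifulSoup

def crawl(start_url):
    pages = set()
    to_visit = [start_url]
    while to_visit:
        url = to_visit.pop()
        if url in pages:
            continue
        pages.add(url)
        try:
            html = requests.get(url, timeout=30).text
        except requests.RequestException:
            continue  # skip pages that time out or refuse the connection
        soup = BeautifulSoup(html, 'html.parser')
        # Same in-domain filter as the original script
        for link in soup.findAll('a', href=re.compile(r"www\.haianw\.com")):
            href = link.attrs.get('href')
            if href and href not in pages:
                to_visit.append(href)
    return pages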
