python3 爬虫 实战

来源:互联网 发布:数据库表的设计 编辑:程序博客网 时间:2024/05/22 10:39

python3 爬虫 实战

  • python3 爬虫 xicidaili.com 实战
    • 第一步 分析网站
    • 第二步 导包
    • 第三步 访问测试
    • 第四步 伪装好人之模拟浏览器访问
    • 第五步 完善之出错重传
    • 第六步 解析页面数据
    • 第七步 数据保存为excel
    • 第八步 抓取所有页面
    • 第九步 优化效率之用内存来换IO
    • 总结

第一步 分析网站



<html><head><title></title></head><body><table>    <tr>      <th>国家</th>      <th>IP地址</th>      <th>端口</th>      <th>服务器地址</th>      <th>是否匿名</th>      <th>类型</th>      <th>速度</th>      <th>连接时间</th>      <th>存活时间</th>      <th>验证时间</th>    </tr>    <tr>      <td><img src='' alt='Cn' /></td>      <td></td>      <td>61234</td>      <td>        <a>广东湛江</a>      </td>      <td>高匿</td>      <td>HTTP</td>      <td class='country'>        <div title='0.248秒' class='bar'>          <div class='bar_inner fast' style='width:92%'>          </div>        </div>      </td>      <td class='country'>        <div title='0.049秒' class='bar'>          <div class='bar_inner fast' style='width:99%'>          </div>        </div>      </td>      <td>1分钟</td>      <td>17-10-31 09:01</td>    </tr>    <tr>      <td><img src='' alt='Cn' /></td>      <td></td>      <td>23735</td>      <td>        <a>江苏南通</a>      </td>      <td>高匿</td>      <td>HTTPS</td>      <td class='country'>        <div title='0.248秒' class='bar'>          <div class='bar_inner fast' style='width:92%'>          </div>        </div>      </td>      <td class='country'>        <div title='0.049秒' class='bar'>          <div class='bar_inner fast' style='width:99%'>          </div>        </div>      </td>      <td>16分钟</td>      <td>17-10-31 09:00</td>    </tr></table></body></html>

第二步 导包

# Standard library
import os
from urllib import request

# Third-party: pip install openpyxl beautifulsoup4
import openpyxl
from bs4 import BeautifulSoup

首先这是Python3 标准,所以导入的是urllib.request
openpyxl 是读写 Excel(.xlsx)文件的第三方库,如果没有安装,请先执行 `pip install openpyxl`;BeautifulSoup 同样需要安装:`pip install beautifulsoup4`。

第三步 访问测试


# -*- coding: utf-8 -*-
from urllib import request
import os
import openpyxl
from bs4 import BeautifulSoup


def get_html(url):
    """Download ``url`` and print the response body decoded as UTF-8.

    No request headers are set, so the request goes out with urllib's
    defaults. Any error raised by ``urlopen`` (for example
    ``urllib.error.HTTPError``) propagates to the caller.
    """
    # The context manager closes the HTTP response even if decoding fails.
    with request.urlopen(url) as resp:
        html = resp.read().decode('utf-8')
    print(html)


if __name__ == '__main__':
    # NOTE(review): the URL literal is empty in this copy of the article;
    # fill in the page to fetch before running.
    url = ''
    get_html(url)


Traceback (most recent call last):  ...urllib.error.HTTPError: HTTP Error 503: Service Temporarily Unavailable


第四步 伪装好人之模拟浏览器访问

上面报错503 服务器猜测访问者是坏人,所以拒绝了。那我们加个header,这是模拟浏览器访问该页面的作用。让自己像个好人。

# -*- coding: utf-8 -*-
from urllib import request
import os
import openpyxl
from bs4 import BeautifulSoup


def get_html(url):
    """Download ``url`` with a browser-like User-Agent and print the body.

    The body is decoded as UTF-8. Any error raised by ``urlopen``
    propagates to the caller.
    """
    # Send a desktop-browser User-Agent instead of urllib's default one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    req = request.Request(url=url, headers=headers)
    # The context manager closes the HTTP response even if decoding fails.
    with request.urlopen(req) as resp:
        html = resp.read().decode('utf-8')
    print(html)


if __name__ == '__main__':
    # NOTE(review): the URL literal is empty in this copy of the article;
    # fill in the page to fetch before running.
    url = ''
    get_html(url)


第五步 完善之出错重传


# -*- coding: utf-8 -*-
from urllib import request
import os
import openpyxl
from bs4 import BeautifulSoup


def get_html(url, num_retries=2):
    """Download ``url`` and print the body, retrying on 5xx HTTP errors.

    Args:
        url: Address of the page to download.
        num_retries: How many more attempts are allowed after a 5xx
            HTTP error.

    On a ``URLError`` the reason is printed. If the error carries a 5xx
    ``code`` and retries remain, the download is attempted again;
    otherwise ``None`` is printed in place of the page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    try:
        req = request.Request(url=url, headers=headers)
        # The context manager closes the HTTP response when done.
        with request.urlopen(req) as resp:
            html = resp.read().decode('utf-8')
    except request.URLError as e:
        # e.reason may be an exception object rather than a str, so convert
        # it before concatenating.
        print('get_html Error:' + str(e.reason))
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return get_html(url, num_retries - 1)
    print(html)


if __name__ == '__main__':
    # NOTE(review): the URL literal is empty in this copy of the article;
    # fill in the page to fetch before running.
    url = ''
    get_html(url)

第六步 解析页面数据


# -*- coding: utf-8 -*-
from urllib import request
import os
import openpyxl
from bs4 import BeautifulSoup

# Fill in html_doc with the sample table HTML before running.
# NOTE(review): the value is missing in this copy of the article; an empty
# string is used as a placeholder so that the script parses.
html_doc = ''


def get_html(url, num_retries=2):
    """Print the local sample page ``html_doc`` instead of downloading ``url``.

    The real download is commented out so that parsing code can be
    developed against ``html_doc``. The ``URLError`` handler is kept so the
    download lines can be re-enabled unchanged.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    try:
        # response = request.Request(url=url, headers=headers)
        # html = request.urlopen(response).read().decode('utf-8')
        html = html_doc
    except request.URLError as e:
        # e.reason may be an exception object rather than a str.
        print('get_html Error:' + str(e.reason))
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return get_html(url, num_retries - 1)
    print(html)


if __name__ == '__main__':
    url = ''
    get_html(url)


# Parse the page text held in `html` with the 'html.parser' backend.
soup = BeautifulSoup(html, 'html.parser')


# Collect every <tr> element of the parsed page.
trs = soup.find_all('tr')


# trs[0] is skipped; each remaining <tr> is handled as one record.
for tr in trs[1:]:
    tds = tr.find_all("td")
    # Only rows with exactly 10 cells are printed.
    if len(tds) == 10:
        print("国家:" + tds[0].img["alt"])
        print("IP地址:" + tds[1].get_text())
        print("端口:" + tds[2].get_text())
        print("服务器地址:" + tds[3].get_text())
        print("是否匿名:" + tds[4].get_text())
        print("类型:" + tds[5].get_text())
        # Speed and connect time are read from the title of the cell's <div>.
        print("速度:" + tds[6].div["title"])
        print("连接时间:" + tds[7].div["title"])
        print("存活时间:" + tds[8].get_text())
        print("验证时间:" + tds[9].get_text())


# -*- coding: utf-8 -*-
from urllib import request
import os
import openpyxl
from bs4 import BeautifulSoup

# NOTE(review): the value of html_doc is missing in this copy of the
# article; fill in the sample table HTML before running.
html_doc = ''


def get_html(url, num_retries=2):
    """Parse the local sample page ``html_doc`` and print every 10-cell row.

    The real download is commented out. ``url`` and ``num_retries`` are
    only used by the ``URLError`` handler, which is kept so the download
    lines can be re-enabled unchanged.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    try:
        # response = request.Request(url=url, headers=headers)
        # html = request.urlopen(response).read().decode('utf-8')
        html = html_doc
        soup = BeautifulSoup(html, 'html.parser')
        trs = soup.find_all('tr')
        # trs[0] is skipped; each remaining <tr> is handled as one record.
        for tr in trs[1:]:
            tds = tr.find_all("td")
            if len(tds) == 10:
                print("国家:" + tds[0].img["alt"])
                print("IP地址:" + tds[1].get_text())
                print("端口:" + tds[2].get_text())
                print("服务器地址:" + tds[3].get_text())
                print("是否匿名:" + tds[4].get_text())
                print("类型:" + tds[5].get_text())
                print("速度:" + tds[6].div["title"])
                print("连接时间:" + tds[7].div["title"])
                print("存活时间:" + tds[8].get_text())
                print("验证时间:" + tds[9].get_text())
    except request.URLError as e:
        # e.reason may be an exception object rather than a str.
        print('get_html Error:' + str(e.reason))
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return get_html(url, num_retries - 1)


if __name__ == '__main__':
    url = ''
    get_html(url)


国家:CnIP地址:端口:61234服务器地址: 广东湛江 是否匿名:高匿类型:HTTP速度:0.248秒连接时间:0.049秒存活时间:1分钟验证时间:17-10-31 09:01国家:CnIP地址:端口:23735服务器地址: 江苏南通 是否匿名:高匿类型:HTTPS速度:0.248秒连接时间:0.049秒存活时间:16分钟验证时间:17-10-31 09:00进程已结束,退出代码0


# -*- coding: utf-8 -*-
from urllib import request
import os
import openpyxl
from bs4 import BeautifulSoup


def get_html(url, num_retries=2):
    """Download ``url`` and print every 10-cell table row it contains.

    Args:
        url: Address of the page to download.
        num_retries: How many more attempts are allowed after a 5xx
            HTTP error.

    Rows are not validated beyond the cell count: a row whose first cell
    has no ``<img>`` makes ``tds[0].img["alt"]`` raise ``TypeError``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    try:
        req = request.Request(url=url, headers=headers)
        # The context manager closes the HTTP response when done.
        with request.urlopen(req) as resp:
            html = resp.read().decode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        trs = soup.find_all('tr')
        # trs[0] is skipped; each remaining <tr> is handled as one record.
        for tr in trs[1:]:
            tds = tr.find_all("td")
            if len(tds) == 10:
                print("---")
                print("国家:" + tds[0].img["alt"])
                print("IP地址:" + tds[1].get_text())
                print("端口:" + tds[2].get_text())
                print("服务器地址:" + tds[3].get_text())
                print("是否匿名:" + tds[4].get_text())
                print("类型:" + tds[5].get_text())
                print("速度:" + tds[6].div["title"])
                print("连接时间:" + tds[7].div["title"])
                print("存活时间:" + tds[8].get_text())
                print("验证时间:" + tds[9].get_text())
    except request.URLError as e:
        # e.reason may be an exception object rather than a str.
        print('get_html Error:' + str(e.reason))
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return get_html(url, num_retries - 1)


if __name__ == '__main__':
    # NOTE(review): the URL literal is empty in this copy of the article;
    # fill in the page to fetch before running.
    url = ''
    get_html(url)


Traceback (most recent call last):  File "E:/pythonworkplace/com/", line 43, in <module>    get_html(url)  File "E:/pythonworkplace/com/", line 22, in get_html    print("国家:" + tds[0].img["alt"])TypeError: 'NoneType' object is not subscriptable


<tr class="odd">      <td class="country"></td>      <td></td>      <td>808</td>      ...

原来第一个td[0]节点没有子节点img。然后我仔细看了看,原来不只这个数据,其他数据也有缺胳膊少腿的。像我这样追求完美的人,这些数据我就去噪了呀。(哈哈,如果你要的话就加一个判断,if else。)

# trs[0] is skipped; each remaining <tr> is handled as one record.
for tr in trs[1:]:
    try:
        tds = tr.find_all("td")
        if len(tds) == 10:
            print("---")
            print("国家:" + tds[0].img["alt"])
            print("IP地址:" + tds[1].get_text())
            print("端口:" + tds[2].get_text())
            print("服务器地址:" + tds[3].get_text())
            print("是否匿名:" + tds[4].get_text())
            print("类型:" + tds[5].get_text())
            print("速度:" + tds[6].div["title"])
            print("连接时间:" + tds[7].div["title"])
            print("存活时间:" + tds[8].get_text())
            print("验证时间:" + tds[9].get_text())
    except TypeError as e:
        # A missing <img>/<div> makes the subscript fail on None; report the
        # row and move on to the next one.
        print('get_html_td TypeError:' + str(e))
        continue


# The enclosing loop header is shown so that the fragment parses on its own
# (the trailing `continue` needs a loop). trs[0] is skipped.
for tr in trs[1:]:
    try:
        tds = tr.find_all("td")
        if len(tds) == 10:
            print("---")
            # Cells that lack their <img>/<div> child print an empty value
            # instead of raising.
            if tds[0].img:
                print("国家:" + tds[0].img["alt"])
            else:
                print("国家:")
            print("IP地址:" + tds[1].get_text())
            print("端口:" + tds[2].get_text())
            print("服务器地址:" + tds[3].get_text())
            print("是否匿名:" + tds[4].get_text())
            print("类型:" + tds[5].get_text())
            if tds[6].div:
                print("速度:" + tds[6].div["title"])
            else:
                print("速度:")
            if tds[7].div:
                print("连接时间:" + tds[7].div["title"])
            else:
                print("连接时间:")
            print("存活时间:" + tds[8].get_text())
            print("验证时间:" + tds[9].get_text())
    except TypeError as e:
        print('get_html_td TypeError:' + str(e))
        continue


服务器地址:     江苏南通 


  1. strip():把头和尾的空格去掉
  2. lstrip():把左边的空格去掉
  3. rstrip():把右边的空格去掉
  4. replace('c1', 'c2'):把字符串里的c1替换成c2。故可以用 replace(' ', '') 来去掉字符串里的所有空格
  5. split(sep, maxsplit):通过指定分隔符 sep 对字符串进行切片;如果指定了 maxsplit,则最多分割 maxsplit 次(即最多得到 maxsplit+1 个子字符串)
  6. 使用正则表达式


# -*- coding: utf-8 -*-
from urllib import request
import os
import openpyxl
from bs4 import BeautifulSoup


def get_html(url, num_retries=2):
    """Download ``url`` and print every 10-cell table row it contains.

    Args:
        url: Address of the page to download.
        num_retries: How many more attempts are allowed after a 5xx
            HTTP error.

    Cells that lack their ``<img>``/``<div>`` child are printed with an
    empty value. A row that still raises ``TypeError`` is reported and
    skipped.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    try:
        req = request.Request(url=url, headers=headers)
        # The context manager closes the HTTP response when done.
        with request.urlopen(req) as resp:
            html = resp.read().decode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        trs = soup.find_all('tr')
        # trs[0] is skipped; each remaining <tr> is handled as one record.
        for tr in trs[1:]:
            try:
                tds = tr.find_all("td")
                if len(tds) == 10:
                    print("---")
                    if tds[0].img:
                        print("国家:" + tds[0].img["alt"])
                    else:
                        print("国家:")
                    print("IP地址:" + tds[1].get_text())
                    print("端口:" + tds[2].get_text())
                    # strip() removes the whitespace around the cell text.
                    print("服务器地址:" + tds[3].get_text().strip())
                    print("是否匿名:" + tds[4].get_text())
                    print("类型:" + tds[5].get_text())
                    if tds[6].div:
                        print("速度:" + tds[6].div["title"])
                    else:
                        print("速度:")
                    if tds[7].div:
                        print("连接时间:" + tds[7].div["title"])
                    else:
                        print("连接时间:")
                    print("存活时间:" + tds[8].get_text())
                    print("验证时间:" + tds[9].get_text())
            except TypeError as e:
                print('get_html_td TypeError:' + str(e))
                continue
    except request.URLError as e:
        # e.reason may be an exception object rather than a str.
        print('get_html Error:' + str(e.reason))
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return get_html(url, num_retries - 1)


if __name__ == '__main__':
    # NOTE(review): the URL literal is empty in this copy of the article;
    # fill in the page to fetch before running.
    url = ''
    get_html(url)


第七步 数据保存为excel

Python 操作 Excel 有很多工具,这里我们使用的是 openpyxl,所以请确保您已经安装了 openpyxl(`pip install openpyxl`);它直接读写 .xlsx 文件,不需要安装 Excel 软件。


# -*- coding: utf-8 -*-
from urllib import request
import os
import openpyxl
from bs4 import BeautifulSoup

# Sample page for offline testing (uncomment, and switch the assignment to
# `html` inside get_html, to parse it instead of downloading):
# html_doc = "<html><head><title></title></head><body><table><tr><th>国家</th><th>IP地址</th><th>端口</th><th>服务器地址</th><th>是否匿名</th><th>类型</th><th>速度</th><th>连接时间</th><th>存活时间</th><th>验证时间</th></tr><tr><td><img src='' alt='Cn' /></td><td></td><td>61234</td><td><a>广东湛江</a></td><td>高匿</td><td>HTTP</td><td class='country'><div title='0.248秒' class='bar'><div class='bar_inner fast' style='width:92%'></div></div></td><td class='country'><div title='0.049秒' class='bar'><div class='bar_inner fast' style='width:99%'></div></div></td><td>1分钟</td><td>17-10-31 09:01</td></tr><tr><td><img src='' alt='Cn' /></td><td></td><td>23735</td><td><a>江苏南通</a></td><td>高匿</td><td>HTTPS</td><td class='country'><div title='0.248秒' class='bar'><div class='bar_inner fast' style='width:92%'></div></div></td><td class='country'><div title='0.049秒' class='bar'><div class='bar_inner fast' style='width:99%'></div></div></td><td>16分钟</td><td>17-10-31 09:00</td></tr>      <tr class='odd'><td class='country'></td><td></td><td>808</td><td>长城宽带</td><td class='country'>高匿</td><td>HTTPS</td><td class='country'><div title='0.16秒' class='bar'><div class='bar_inner fast' style='width:88%'></div></div></td><td class='country'><div title='0.032秒' class='bar'><div class='bar_inner fast' style='width:95%'></div></div></td><td>4天</td><td>17-10-31 11:11</td></tr>       <tr><td><img src='' alt='Cn' /></td><td></td><td>23735</td><td><a>江苏大大</a></td><td>高匿</td><td>HTTPS</td><td class='country'><div title='0.248秒' class='bar'><div class='bar_inner fast' style='width:92%'></div></div></td><td class='country'><div title='0.049秒' class='bar'><div class='bar_inner fast' style='width:99%'></div></div></td><td>16分钟</td><td>17-10-31 09:00</td></tr></table></body></html>"


def get_html(url, num_retries=2):
    """Download ``url``, collect its 10-cell table rows and save them to Excel.

    Args:
        url: Address of the page to download.
        num_retries: How many more attempts are allowed after a 5xx
            HTTP error.

    Each kept row becomes a 10-tuple of strings. Cells that lack their
    ``<img>``/``<div>`` child contribute an empty string. The collected
    rows are handed to ``write_excel``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    try:
        req = request.Request(url=url, headers=headers)
        # The context manager closes the HTTP response when done.
        with request.urlopen(req) as resp:
            html = resp.read().decode('utf-8')
        # html = html_doc
        soup = BeautifulSoup(html, 'html.parser')
        trs = soup.find_all('tr')
        items = []
        # trs[0] is skipped; each remaining <tr> is handled as one record.
        for tr in trs[1:]:
            try:
                tds = tr.find_all("td")
                tds0, tds6, tds7 = '', '', ''
                if len(tds) == 10:
                    if tds[0].img:
                        tds0 = tds[0].img["alt"]
                    if tds[6].div:
                        tds6 = tds[6].div["title"]
                    if tds[7].div:
                        tds7 = tds[7].div["title"]
                    item = (tds0, tds[1].get_text(), tds[2].get_text(),
                            tds[3].get_text().strip(), tds[4].get_text(),
                            tds[5].get_text(), tds6, tds7, tds[8].get_text(),
                            tds[9].get_text())
                    items.append(item)
            except TypeError as e:
                print('get_html_td TypeError:' + str(e))
                continue
        print("--page--")
        write_excel(items)
    except request.URLError as e:
        # e.reason may be an exception object rather than a str.
        print('get_html Error:' + str(e.reason))
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return get_html(url, num_retries - 1)


def write_excel(items):
    """Append ``items`` (an iterable of row tuples) to the Excel workbook.

    The workbook is created with a header row on first use and reopened
    on later calls. It is saved before returning.
    """
    # Location of the Excel file to write.
    excel_file = r'E:\xicidaili.xlsx'
    sheet_title = u'国内高匿代理IP'
    if not os.path.exists(excel_file):
        # First call: create the workbook.
        wb = openpyxl.Workbook()
        # NOTE(review): the right-hand side of this assignment is missing in
        # this copy of the article; wb.active (the sheet a new Workbook starts
        # with) is the presumed original.
        ws = wb.active
        # Rename the sheet and write the table header.
        ws.title = sheet_title
        ws.append(['国家', 'IP地址', '端口', '服务器地址', '是否匿名', '类型',
                   '速度', '连接时间', '存活时间', '验证时间'])
    else:
        # The file already exists: open it and select the sheet by title
        # (wb[...] replaces the deprecated get_sheet_by_name).
        wb = openpyxl.load_workbook(excel_file)
        ws = wb[sheet_title]
    # Add the rows to the sheet.
    for item in items:
        ws.append(item)
    # Save the file; without this nothing reaches the disk.
    wb.save(excel_file)


if __name__ == '__main__':
    # NOTE(review): the URL literal is empty in this copy of the article;
    # fill in the page to fetch before running.
    url = ''
    get_html(url)

这里我们多了一个函数 write_excel,该函数用于把 items 保存到 excel,items 形如 [('a','b'),('c','d'),('e','f')…]
所以我们从get_html获取items 用于传给write_excel

第八步 抓取所有页面

上面已经实现了爬取一页的100条数据,且保存于excel中。接下来把页码写成 URL 里的占位符(%d),用循环依次抓取每一页。

# -*- coding: utf-8 -*-
from urllib import request
import os
import openpyxl
from bs4 import BeautifulSoup


def get_htmls(url, num):
    """Scrape pages 1..``num``; ``url`` must contain a ``%d`` page placeholder."""
    for i in range(1, num + 1):
        print(url % i)
        get_html(url % i)


def get_html(url, num_retries=2):
    """Download ``url``, collect its 10-cell table rows and save them to Excel.

    Args:
        url: Address of the page to download.
        num_retries: How many more attempts are allowed after a 5xx
            HTTP error.

    Each kept row becomes a 10-tuple of strings. Cells that lack their
    ``<img>``/``<div>`` child contribute an empty string. The collected
    rows are handed to ``write_excel``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    try:
        req = request.Request(url=url, headers=headers)
        # The context manager closes the HTTP response when done.
        with request.urlopen(req) as resp:
            html = resp.read().decode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        trs = soup.find_all('tr')
        items = []
        # trs[0] is skipped; each remaining <tr> is handled as one record.
        for tr in trs[1:]:
            try:
                tds = tr.find_all("td")
                tds0, tds6, tds7 = '', '', ''
                if len(tds) == 10:
                    if tds[0].img:
                        tds0 = tds[0].img["alt"]
                    if tds[6].div:
                        tds6 = tds[6].div["title"]
                    if tds[7].div:
                        tds7 = tds[7].div["title"]
                    item = (tds0, tds[1].get_text(), tds[2].get_text(),
                            tds[3].get_text().strip(), tds[4].get_text(),
                            tds[5].get_text(), tds6, tds7, tds[8].get_text(),
                            tds[9].get_text())
                    items.append(item)
            except TypeError as e:
                print('get_html_td TypeError:' + str(e))
                continue
        write_excel(items)
    except request.URLError as e:
        # e.reason may be an exception object rather than a str.
        print('get_html Error:' + str(e.reason))
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return get_html(url, num_retries - 1)


def write_excel(items):
    """Append ``items`` (an iterable of row tuples) to the Excel workbook.

    The workbook is created with a header row on first use and reopened
    on later calls. It is saved before returning.
    """
    excel_file = r'E:\xicidaili.xlsx'
    sheet_title = u'国内高匿代理IP'
    if not os.path.exists(excel_file):
        wb = openpyxl.Workbook()
        # NOTE(review): the right-hand side of this assignment is missing in
        # this copy of the article; wb.active (the sheet a new Workbook starts
        # with) is the presumed original.
        ws = wb.active
        ws.title = sheet_title
        ws.append(['国家', 'IP地址', '端口', '服务器地址', '是否匿名', '类型',
                   '速度', '连接时间', '存活时间', '验证时间'])
    else:
        wb = openpyxl.load_workbook(excel_file)
        # wb[...] replaces the deprecated get_sheet_by_name.
        ws = wb[sheet_title]
    for item in items:
        ws.append(item)
    # Without saving, the appended rows never reach the disk.
    wb.save(excel_file)


if __name__ == '__main__':
    # NOTE(review): the URL literal is empty in this copy of the article. It
    # must be the listing URL with a %d placeholder for the page number,
    # because get_htmls formats it with `url % i`.
    url = ''
    get_htmls(url, 2488)



第九步 优化效率之用内存来换IO






# -*- coding: utf-8 -*-
from urllib import request
import os
import openpyxl
from bs4 import BeautifulSoup


def _fetch_page(page_url, headers, num_retries):
    """Return the decoded HTML of ``page_url``, or None if it cannot be fetched."""
    for _ in range(num_retries + 1):
        try:
            req = request.Request(url=page_url, headers=headers)
            # The context manager closes the HTTP response when done.
            with request.urlopen(req) as resp:
                return resp.read().decode('utf-8')
        except request.URLError as e:
            # e.reason may be an exception object rather than a str.
            print('get_html Error:' + str(e.reason))
            # Only 5xx HTTP errors are retried; anything else gives up now.
            if not (hasattr(e, 'code') and 500 <= e.code < 600):
                return None
    return None


def _parse_rows(html):
    """Return one 10-tuple of strings per 10-cell table row found in ``html``."""
    soup = BeautifulSoup(html, 'html.parser')
    rows = []
    # The first <tr> is skipped; each remaining one is handled as one record.
    for tr in soup.find_all('tr')[1:]:
        try:
            tds = tr.find_all("td")
            if len(tds) != 10:
                continue
            # Cells that lack their <img>/<div> child contribute ''.
            tds0 = tds[0].img["alt"] if tds[0].img else ''
            tds6 = tds[6].div["title"] if tds[6].div else ''
            tds7 = tds[7].div["title"] if tds[7].div else ''
            rows.append((tds0, tds[1].get_text(), tds[2].get_text(),
                         tds[3].get_text().strip(), tds[4].get_text(),
                         tds[5].get_text(), tds6, tds7, tds[8].get_text(),
                         tds[9].get_text()))
        except TypeError as e:
            print('get_html_td TypeError:' + str(e))
            continue
    return rows


def get_html(url, num_retries=2, num_pages=2488):
    """Scrape pages 1..``num_pages`` and save the rows to Excel in batches.

    Args:
        url: Listing URL containing a ``%d`` placeholder for the page number.
        num_retries: Extra attempts allowed per page after a 5xx HTTP error.
        num_pages: Number of the last page to scrape.

    Rows are buffered in memory and written out every 50 pages (and after
    the last page), so the workbook is opened and saved far less often
    than once per page. A page that fails is retried on its own; the
    crawl does not start over and rows already buffered are kept.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    items = []
    for page in range(1, num_pages + 1):
        html = _fetch_page(url % page, headers, num_retries)
        if html is not None:
            items.extend(_parse_rows(html))
            print(url % page)
        if page % 50 == 0 or page == num_pages:
            write_excel(items)
            items = []
            print("---finish " + str(page) + "pages---")


def write_excel(items):
    """Append ``items`` (an iterable of row tuples) to the Excel workbook.

    The workbook is created with a header row on first use and reopened
    on later calls. It is saved before returning.
    """
    excel_file = r'E:\xicidaili.xlsx'
    sheet_title = u'国内高匿代理IP'
    if not os.path.exists(excel_file):
        wb = openpyxl.Workbook()
        # NOTE(review): the right-hand side of this assignment is missing in
        # this copy of the article; wb.active (the sheet a new Workbook starts
        # with) is the presumed original.
        ws = wb.active
        ws.title = sheet_title
        ws.append(['国家', 'IP地址', '端口', '服务器地址', '是否匿名', '类型',
                   '速度', '连接时间', '存活时间', '验证时间'])
    else:
        wb = openpyxl.load_workbook(excel_file)
        # wb[...] replaces the deprecated get_sheet_by_name.
        ws = wb[sheet_title]
    for item in items:
        ws.append(item)
    # Without saving, the appended rows never reach the disk.
    wb.save(excel_file)


if __name__ == '__main__':
    # NOTE(review): the URL literal is empty in this copy of the article. It
    # must be the listing URL with a %d placeholder for the page number,
    # because get_html formats it with `url % page`.
    url = ''
    get_html(url)



P.S.:程序可以正常运行,不过要注意循环里的重试逻辑:如果在翻页循环内部递归调用 get_html 来重试,就会从第 1 页重新开始抓取,并丢掉尚未写入 excel 的 items,所以出错重试应当只针对当前这一页。哈哈


