Python HTTP requests (reading a proxy IP list)

Example: read a list of proxy IPs

Key points

  • beautifulsoup4
  • urllib & urllib2 & httplib2
  • network requests
  • file operations
#encoding=utf-8
# from fib import fibonacci
# This form does not import the whole fib module into the current namespace;
# it only brings the single name fibonacci from fib into the global symbol
# table of the module executing the import.
from bs4 import BeautifulSoup
import urllib2
import urllib

# Steps to install beautifulsoup4:
# curl http://www.crummy.com/software/BeautifulSoup/bs4/download/4.1/beautifulsoup4-4.1.2.tar.gz >> beautifulsoup4-4.1.2.tar.gz
# tar zxvf beautifulsoup4-4.1.2.tar.gz
# cd beautifulsoup4-4.1.2
# python setup.py install

# Difference between urllib2 and urllib (http://blog.csdn.net/dolphin_h/article/details/45296353):
# urllib2 can accept a Request instance to set the headers of a URL request,
# while urllib only accepts a URL. This means you cannot spoof your
# User-Agent string and the like with urllib alone.
    # url = 'http://www.someserver.com/cgi-bin/register.cgi'
    # user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    # # put the user_agent into the request headers
    # values = {'name': 'who', 'password': '123456'}
    # headers = {'User-Agent': user_agent}
    # data = urllib.urlencode(values)  # {'wd':'D_in'} => wd=D_in
    # req = urllib2.Request(url, data, headers)
    # response = urllib2.urlopen(req)
    # the_page = response.read()

    # url = r'http://www.renren.com/ajaxLogin'
    #
    # # create a CookieJar container cj for the cookies
    # cj = cookielib.CookieJar()
    # opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    # # encode the data to be POSTed
    # # (the original used "pass" as a variable name, which is a reserved
    # # word in Python; renamed to password here)
    # data = urllib.urlencode({"email": email, "password": password})
    # r = opener.open(url, data)
    # print cj

# urllib provides the urlencode method for building GET query strings,
# which urllib2 lacks. This is why urllib is often used together with urllib2.
# If you only do HTTP work, take a look at httplib2 -- it is easier to use
# than the other modules.
    # def sendhttp():
    #     data = urllib.urlencode({'@number': 12524, '@type': 'issue', '@action': 'show'})
    #     headers = {"Content-type": "application/x-www-form-urlencoded",
    #                "Accept": "text/plain"}
    #     conn = httplib.HTTPConnection('bugs.python.org')
    #     conn.request('POST', '/', data, headers)
    #     httpres = conn.getresponse()
    #     print httpres.status
    #     print httpres.reason
    #     print httpres.read()
    #
    # if __name__ == '__main__':
    #     sendhttp()

of = open('proxy2.txt', 'w')
for page in range(1, 2):
    print "start", page
    # pretend to be a browser
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url='http://www.xici.net.co/nn/' + str(page), headers=headers)
    resp = urllib2.urlopen(req)
    # req = urllib.urlopen('http://www.xici.net.co/nn')
    # Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
    # User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36
    html_doc = resp.read()
    # If Chinese text in the source comes out garbled, try: print html_doc.decode('u8')
    print 'response code:', resp.getcode()
    print 'response header:', resp.info()
    print 'resp content:', html_doc

    # Page data (abridged):
    #  <table id="ip_list">
    #     <tr>
    #       <th></th>
    #       <th>国家</th>
    #       <th>IP地址</th>
    #       <th>端口</th>
    #       <th>位置</th>
    #       <th>是否匿名</th>
    #       <th>类型</th>
    #       <th>速度</th>
    #       <th>连接时间</th>
    #       <th>验证时间</th>
    #     </tr>
    #     ....
    # </table>
    soup = BeautifulSoup(html_doc)
    # the table whose id is ip_list; collect all of its tr rows
    trs = soup.find('table', id='ip_list').find_all('tr')
    print "trs"
    for tr in trs[1:]:
        # Sample row:
        # <tr class="">
        #     <td></td>
        #     <td><img alt="Cn" src="http://fs.xicidaili.com/images/flag/cn.png"/></td>
        #     <td>114.105.216.240</td>
        #     <td>6675</td>
        #     <td>
        #             安徽亳州
        #           </td>
        #     <td>高匿</td>
        #     <td>socks4/5</td>
        #     <td>
        #     <div class="bar" title="1.413秒">
        #     <div class="bar_inner fast" style="width:86%">
        #     </div>
        #     </div>
        #     </td>
        #     <td>
        #     <div class="bar" title="0.412秒">
        #     <div class="bar_inner fast" style="width:92%">
        #     </div>
        #     </div>
        #     </td>
        #     <td>13-01-09 15:11</td>
        # </tr>
        print "----------------------------------------\n", tr
        tds = tr.find_all('td')
        ip = tds[2].text.strip()        # index 2 is the IP column
        port = tds[3].text.strip()
        protocol = tds[6].text.strip()
        print "=============:\n"
        print ip, ":", port, " ", protocol
        print "============="
        if protocol == 'HTTP' or protocol == 'HTTPS':
            of.write('%s=%s:%s\n' % (protocol, ip, port))
            print '%s=%s:%s' % (protocol, ip, port)
of.close()
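
A couple of short sketches make the points in the comments above concrete. First, the urlencode helper that urllib provides and urllib2 lacks; the URL here is just illustrative:

# -*- coding: utf-8 -*-
import urllib

# urlencode turns a dict into a query string: {'wd': 'D_in'} => 'wd=D_in'
query = urllib.urlencode({'wd': 'D_in', 'page': 1})
print 'http://example.com/search?' + query   # hypothetical URL, for illustration only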
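
The comments also recommend httplib2 for pure HTTP work but only show the stdlib httplib. A minimal GET with httplib2, assuming the package is installed (pip install httplib2), looks roughly like this:

import httplib2

h = httplib2.Http()
# request() returns a (response, content) pair; headers go in as a plain dict
resp, content = h.request('http://www.xici.net.co/nn', 'GET',
                          headers={'User-Agent': 'Mozilla/5.0'})
print resp.status
print content[:200]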
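
To see the find / find_all pattern in isolation, here is a self-contained sketch that parses a made-up miniature of the ip_list table instead of the live page (the rows and column layout are simplified for illustration):

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

# A made-up, simplified version of the ip_list table shown above.
html = """
<table id="ip_list">
  <tr><th>IP</th><th>Port</th><th>Type</th></tr>
  <tr><td>1.2.3.4</td><td>8080</td><td>HTTP</td></tr>
  <tr><td>5.6.7.8</td><td>3128</td><td>HTTPS</td></tr>
</table>
"""

soup = BeautifulSoup(html)
table = soup.find('table', id='ip_list')
for tr in table.find_all('tr')[1:]:          # skip the header row
    tds = tr.find_all('td')
    print tds[0].text.strip(), tds[1].text.strip(), tds[2].text.strip()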
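
Finally, since the script writes one HTTP=ip:port line per proxy, reading proxy2.txt back (the file-operations point from the list above) is a matter of splitting each line; a small sketch:

# Read proxy2.txt back into (protocol, ip, port) tuples.
proxies = []
f = open('proxy2.txt', 'r')
for line in f:
    line = line.strip()
    if not line:
        continue
    protocol, addr = line.split('=', 1)
    ip, port = addr.split(':', 1)
    proxies.append((protocol, ip, port))
f.close()
print proxies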