使用代理的爬虫小程序

来源:互联网 发布:继续教育网络培训总结 编辑:程序博客网 时间:2024/06/04 19:04

使用 IP 117.135.250.134、端口 80 作为代理服务器,爬取了百度首页的代码。

import urllib.requestimport osimport sysimport redef testArgument(url):        TP=TestProxy(url)def tipUse():    print('改程序只能输入一个参数,这个参数必须是可用的proxy')    print('usage:python test Urllib2WithProxy.py http//1.2.3.4:5')    print('usage:python test Urllib2WithProxy.py https//1.2.3.4:5')class TestProxy(object):    def __init__(self,proxy):        self.proxy = proxy        self.checkProxyFormat(self.proxy)        self.url = 'http://www.baidu.com'        self.timeout=5        self.flagWord='百度'        self.useProxy(self.proxy)    def checkProxyFormat(self,proxy):        try:            proxyMatch = re.compile('http[s]?://[\d]{1,3}\.{\d}{1,3}\.[\d]{1,3}:[\d]{1,5}$')            proxyMatch.match(proxy)        except AttributeError:            tipUse()            exit()        flag = 1        proxy = proxy.replace('//','')        try:            protocol = proxy.split(':')[0]            ip = proxy.split(':')[1]            port = proxy.split(':')[2]        except IndexError:            print('下标出界')            tipUse()            exit()        flag = flag and len(proxy.split(':')) and len(ip.split('.'))        flag = ip.split('.')[0] in map(str,range(1,256)) and flag        flag = ip.split('.')[1] in map(str,range(256)) and flag        flag = ip.split('.')[2] in map(str,range(256)) and flag        flag = ip.split('.')[3] in map(str,range(1,255)) and flag        flag = protocol in ['http','https'] and flag        flag = port in map(str,range(1,65535)) and flag        if flag:            print('输入的http代理服务器符合标准')        else:            tipUse()            exit()                def useProxy(self,proxy):        protocol = proxy.split('//')[0].replace(':','')        ip = proxy.split('//')[1]        opener = urllib.request.build_opener(urllib.request.ProxyHandler({protocol:ip}))        urllib.request.install_opener(opener)        try:            response = urllib.request.urlopen(self.url,timeout=self.timeout)        except:            print('连接错误,退出程序')            exit()             
   data = response.read()        data = data.decode('UTF-8')        print(data)testArgument('https://117.135.250.134:80')


原创粉丝点击