Python Crawler: First Attempt
1.) The simplest crawler:
```python
#encoding:utf-8
import urllib2

ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}

request = urllib2.Request("http://www.baidu.com", headers=ua_headers)
response = urllib2.urlopen(request)
htmldata = response.read()
print htmldata
```
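All the code in this post is Python 2 (`urllib2`, `print` statements). For readers on Python 3, a minimal sketch of the same request, where `urllib2`'s classes moved into `urllib.request`:

```python
# Python 3 sketch of the same request; urllib2 was split into urllib.request.
from urllib.request import Request, urlopen

ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}

request = Request("http://www.baidu.com", headers=ua_headers)
response = urlopen(request)
htmldata = response.read().decode("utf-8")   # read() returns bytes in Python 3
print(htmldata)
```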
2.) Simulating a browser:
```python
#encoding:utf-8
import urllib2
import urllib
import random

ua_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"]
user_agent = random.choice(ua_list)

request = urllib2.Request("http://www.baidu.com")
request.add_header("User-Agent", user_agent)
response = urllib2.urlopen(request)
htmldata = response.read()
#print htmldata

# HTTP status code of the response
print response.getcode()
# the actual URL of the returned data, after any redirects
print response.geturl()
# the server's response headers
#print response.info()
# the User-Agent header we sent with the request
print request.get_header("User-agent")

# urllib's urlencode() takes a dict and percent-encodes it as UTF-8
wd = {"wd": "百度"}
m = urllib.urlencode(wd)
# and back again:
print urllib.unquote(m)
```
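In practice the `urlencode()` result is appended to a base URL as a GET query string. A short sketch of that step; treating `/s?wd=` as Baidu's search endpoint is an assumption here, not something the script above verifies:

```python
#encoding:utf-8
# Sketch: turn the urlencoded dict into a GET query string.
# The /s?wd= search endpoint is an assumption about Baidu's URL scheme.
import urllib
import urllib2

query = urllib.urlencode({"wd": "百度"})   # -> "wd=%E7%99%BE%E5%BA%A6"
searchurl = "http://www.baidu.com/s?" + query
request = urllib2.Request(searchurl)
request.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36")
response = urllib2.urlopen(request)
print response.getcode()
```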
3.) Scraping with XPath
On using XPath:
http://cuiqingcai.com/2621.html
http://blog.csdn.net/MrLevo520/article/details/53158050
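Before the real page, a minimal self-contained sketch of the two XPath forms used below (`//` searches the whole tree, `./` is relative to the current node); the HTML fragment here is invented for illustration:

```python
#encoding:utf-8
# XPath warm-up on a hand-written HTML fragment (not a real page).
from lxml import etree

html = """
<div class="li_txt"><h3>Teacher A</h3><h4>Senior lecturer</h4></div>
<div class="li_txt"><h3>Teacher B</h3><h4>Lecturer</h4></div>
"""
tree = etree.HTML(html)
# //div[@class='li_txt'] matches every such div anywhere in the document
for div in tree.xpath("//div[@class='li_txt']"):
    # ./h3/text() reads the text of the div's direct h3 child
    print div.xpath("./h3/text()")[0]
    print div.xpath("./h4")[0].text
```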
```
[root@VM_131_54_centos pachong]# vi itcast.py
[root@VM_131_54_centos pachong]# cat itcast.py
```
```python
import urllib
import urllib2
from lxml import etree

url = "http://www.itcast.cn/channel/teacher.shtml"
ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
request = urllib2.Request(url, headers=ua_headers)
response = urllib2.urlopen(request)
htmldata = response.read()
htmldata = etree.HTML(htmldata)
#print htmldata
content = htmldata.xpath("//div[@class='li_txt']//*")
print type(content)
print len(content)
print content[2].text
```
```
[root@VM_131_54_centos pachong]# python itcast.py
<type 'list'>
582
15年以上的软件开发、大型软件项目设计和团队管理经验。精通C/C++、pascal、Basic等各种编程语言,精通MySQL、Oracle、SQLServer等关系数据库。讲课深入浅出,将复杂理论转化为通俗易懂的语言,深受学生的好评。
```
```
[root@VM_131_54_centos pachong]# cat paitcast.py
```
```python
#encoding:utf-8
import urllib
import urllib2
from lxml import etree
import sys

url = "http://www.itcast.cn/channel/teacher.shtml"
ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
request = urllib2.Request(url, headers=ua_headers)
response = urllib2.urlopen(request)
htmldata = response.read()
htmldata = etree.HTML(htmldata)
#print htmldata
allcontent = htmldata.xpath("//div[@class='li_txt']")
for x in allcontent:
    print x.xpath('./h3/text()')[0]
    print x.xpath('./h4')[0].text
    sys.exit()   # stop after the first teacher, just to demonstrate
```
```
[root@VM_131_54_centos pachong]# python paitcast.py
朱老师
高级讲师
```
4.) Using an HTTP proxy
```python
#encoding:utf-8
import urllib2
from lxml import etree

def testproxy():
    ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
    request = urllib2.Request("http://www.baidu.com")
    request.add_header("User-Agent", ua_headers["User-Agent"])   # pass the string, not the dict
    handler = urllib2.ProxyHandler({"http": "117.65.42.1:51552"})
    opener = urllib2.build_opener(handler)
    urllib2.install_opener(opener)   # every later urlopen() now goes through the proxy
    #r = opener.open('http://www.baidu.com')
    #print(r.read())
    try:
        response = urllib2.urlopen(request, timeout=1)
        htmldata = response.read()
        htmldata = etree.HTML(htmldata)
        content = htmldata.xpath("//title")
        print content[0].text
    except Exception as e:
        print e

if __name__ == "__main__":
    testproxy()
```
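The two commented-out lines in `testproxy()` point at an alternative worth noting: calling the opener directly keeps the proxy scoped to that one opener, while `install_opener()` reroutes every later `urllib2.urlopen()` call globally. A sketch of the local variant, reusing the same placeholder proxy address:

```python
#encoding:utf-8
# Sketch: proxy scoped to one opener instead of installed globally.
import urllib2

handler = urllib2.ProxyHandler({"http": "117.65.42.1:51552"})
opener = urllib2.build_opener(handler)
response = opener.open("http://www.baidu.com", timeout=1)   # only this opener uses the proxy
print response.getcode()
```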
5.) Scraping free proxies from a website
```python
#encoding:utf-8
import urllib2
import sys
from lxml import etree
import time
import csv

def getHttpProxyFromXicidali(page=1):
    """
    :page: number of pages to scrape, int
    :return: a list of HTTP proxies.
    """
    proxylist = []
    ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
    url = "http://www.xicidaili.com/wt/"
    #url = "http://www.baidu.com"
    for x in range(1, page + 1):
        pageurl = url + str(x)
        print pageurl
        request = urllib2.Request(pageurl)
        request.add_header("User-Agent", ua_headers["User-Agent"])   # pass the string, not the dict
        response = urllib2.urlopen(request)
        htmldata = response.read()
        htmldata = etree.HTML(htmldata)
        content = htmldata.xpath("//tr[@class='odd']")
        #sys.exit()
        for x in range(len(content)):
            #print x
            # table columns: IP, port, anonymity, type (HTTP/HTTPS)
            proxylist.append([content[x][1].text, content[x][2].text,
                              content[x][4].text, content[x][5].text])
        #print proxylist[0][2] == u""
        #sys.exit()
    return proxylist

# read saved usable proxies back from a file (despite the name, this reads)
def readProxyToFile(path):
    L = []
    with open(path, "rb") as f:
        sp = csv.reader(f, delimiter=',')
        for x in sp:
            L.append(x)
    return L[1:]   # drop the header row

def writeProxyToFile(proxylist, path="./proxylist.csv"):
    """
    :param proxylist: return value of getHttpProxyFromXicidali(), a list.
    :param path: output file
    :return:
    """
    with open(path, 'w') as f:
        # IP address, port, anonymous or not, type HTTP/HTTPS
        fieldnames = ['ip', 'port', 'ishide', 'type']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for x in proxylist:
            if x[2] == u"高匿":     # "high anonymity"
                writer.writerow({'ip': x[0], 'port': x[1], 'ishide': "1", 'type': x[3]})
            elif x[2] == u"透明":   # "transparent"
                writer.writerow({'ip': x[0], 'port': x[1], 'ishide': "0", 'type': x[3]})

# test whether each proxy actually works
def testIsUseful(L):
    """
    :L: a list of free HTTP proxies.
    :return: a list of the proxies that work.
    """
    usefulproxy = []
    ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
    request = urllib2.Request("http://www.baidu.com")
    request.add_header("User-Agent", ua_headers["User-Agent"])
    for x in L:
        httpproxy = str(x[0]) + ':' + str(x[1])
        handler = urllib2.ProxyHandler({"http": httpproxy})
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
        try:
            response = urllib2.urlopen(request, timeout=1)
            htmldata = response.read()
            htmldata = etree.HTML(htmldata)
            content = htmldata.xpath("//title")
            if content[0].text == u"百度一下,你就知道":
                usefulproxy.append(x)
                print 'ok'
        except Exception as e:
            print e
            continue
    return usefulproxy

if __name__ == "__main__":
    # scrape HTTP proxies from xicidaili (100 proxies per page)
    #proxylist = getHttpProxyFromXicidali(10)
    # write every scraped proxy to file (working, timed out, dead)
    #writeProxyToFile(proxylist)
    # test each proxy and keep only the usable, non-timing-out ones
    #proxylist = testIsUseful(proxylist)
    #writeProxyToFile(proxylist, path="./usefulproxylist.csv")
    proxylist = readProxyToFile("./usefulproxylist.csv")
```
A small program for looking up phone numbers
```python
#encoding:utf-8
import copy
import urllib2
import random
import time
import sys
from lxml import etree
import csv
from pyecharts import Map

class GetProxy(object):
    def getHttpProxyFromXicidali(self, page=1):
        """
        :page: number of pages to scrape, int
        :return: a list of HTTP proxies.
        """
        proxylist = []
        ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
        url = "http://www.xicidaili.com/wt/"
        # url = "http://www.baidu.com"
        for x in range(1, page + 1):
            pageurl = url + str(x)
            print pageurl
            request = urllib2.Request(pageurl)
            request.add_header("User-Agent", ua_headers["User-Agent"])
            response = urllib2.urlopen(request)
            htmldata = response.read()
            htmldata = etree.HTML(htmldata)
            content = htmldata.xpath("//tr[@class='odd']")
            for x in range(len(content)):
                proxylist.append([content[x][1].text, content[x][2].text,
                                  content[x][4].text, content[x][5].text])
        return proxylist

    # read saved usable proxies back from a file
    def readProxyToFile(self, path):
        L = []
        with open(path, "rb") as f:
            sp = csv.reader(f, delimiter=',')
            for x in sp:
                L.append(x)
        return L[1:]

    def writeProxyToFile(self, proxylist, path="./proxylist.csv"):
        """
        :param proxylist: return value of getHttpProxyFromXicidali(), a list.
        :param path: output file
        """
        with open(path, 'w') as f:
            # IP address, port, anonymous or not, type HTTP/HTTPS
            fieldnames = ['ip', 'port', 'ishide', 'type']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for x in proxylist:
                if x[2] == u"高匿":     # "high anonymity"
                    writer.writerow({'ip': x[0], 'port': x[1], 'ishide': "1", 'type': x[3]})
                elif x[2] == u"透明":   # "transparent"
                    writer.writerow({'ip': x[0], 'port': x[1], 'ishide': "0", 'type': x[3]})

    # test whether each proxy works
    def testIsUseful(self, L):
        """
        :L: a list of free HTTP proxies.
        :return: a list of the proxies that work.
        """
        usefulproxy = []
        ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
        request = urllib2.Request("http://www.baidu.com")
        request.add_header("User-Agent", ua_headers["User-Agent"])
        for x in L:
            httpproxy = str(x[0]) + ':' + str(x[1])
            handler = urllib2.ProxyHandler({"http": httpproxy})
            opener = urllib2.build_opener(handler)
            urllib2.install_opener(opener)
            try:
                response = urllib2.urlopen(request, timeout=1)
                htmldata = response.read()
                htmldata = etree.HTML(htmldata)
                content = htmldata.xpath("//title")
                if content[0].text == u"百度一下,你就知道":
                    usefulproxy.append(x)
                    print 'ok'
            except Exception as e:
                print e
                continue
        return usefulproxy

def getUsefuleProxy():
    p = GetProxy()
    # scrape HTTP proxies from xicidaili (100 per page)
    #proxylist = p.getHttpProxyFromXicidali(1)
    # write every scraped proxy to file (working or not)
    #p.writeProxyToFile(proxylist)
    # test them and keep only the usable, non-timing-out ones
    #proxylist = p.testIsUseful(proxylist)
    #p.writeProxyToFile(proxylist, path="./usefulproxylist.csv")
    proxylist = p.readProxyToFile("./usefulproxylist.csv")
    return proxylist

def getPhoneList(filepath=None, dbconn=None):
    """
    :filepath: data file
    :dbconn: database connection (host, ip, user, password, database)
    :return: a list of deduplicated phone numbers.
    """
    # data comes from a file only
    if filepath is not None and dbconn is None:
        with open(filepath, 'rb') as f:
            all = f.readlines()
        return all
    # data comes from a database only
    elif dbconn is not None and filepath is None:
        pass
    # if both are empty, or both are given, return None
    else:
        return None

class AnalyseHtml():
    def __init__(self, L, pool, filepath=None, dbconn=None, proxyswitch=True):
        """
        :L: the phone numbers, as a tuple or a list.
        :pool: a pool of free, passwordless HTTP proxies
        :filepath: write the results to a file (number, province, city, carrier)
        :dbconn: (host, ip, user, password, database, tablename1, tablename2); write the results to tb1 and tb2.
        """
        self.L = L
        self.pool = []
        for x in pool:
            x = {"http": x}
            self.pool.append(x)
        self.pool.append({})   # an empty dict means a direct connection, no proxy
        self.filepath = filepath
        self.dbconn = dbconn
        self.proxyswitch = proxyswitch
        self.data = []
        self.stime = 3 / len(self.pool)   # Python 2 integer division: this is 0 once the pool holds more than three entries
        print self.pool, "__init__"
        # copypool keeps the initial pool so it can be restored later
        self.copypool = copy.deepcopy(self.pool)

    def outPutData(self):
        if self.filepath is not None and self.dbconn is None:
            with open(self.filepath, 'w') as f:
                fieldnames = ['number', 'Province', 'City', 'Corp']
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                for x in self.data:
                    writer.writerow({'number': x[0], 'Province': x[1], 'City': x[2], 'Corp': x[3]})
        elif self.dbconn is not None and self.filepath is None:
            pass

    # yields a different proxy handler on every call
    def createHandler(self):
        while 1:
            if self.pool == []:
                self.pool = copy.deepcopy(self.copypool)
            x = self.pool.pop()
            handler = urllib2.ProxyHandler(x)
            yield handler

    # picks a different browser agent each time
    def addAgent(self, request):
        """
        :request: a request.
        :return: the modified request
        """
        ua_list = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"]
        user_agent = random.choice(ua_list)
        request.add_header("User-Agent", user_agent)
        return request

    def getHandler(self):
        """
        :return: a handler
        """
        # note: this builds a fresh generator on every call and advances it once;
        # rotation still works because the pool lives on self and shrinks with pop()
        return next(self.createHandler())

    def postToGetData(self):
        """
        :return: (number, province, city, carrier)
        """
        count = 0
        for x in self.L:
            #x = x[:12]
            # int() strips the trailing newline that readlines() leaves on each
            # number; str() turns it back into clean digits for the URL
            x = str(int(x[:12]))
            url = "http://v.showji.com/Locating/showji.com2016234999234.aspx?m=" + x + "&output=json"
            print url
            while 1:
                try:
                    count += 1
                    handler = self.getHandler()
                    opener = urllib2.build_opener(handler)
                    urllib2.install_opener(opener)
                    request = urllib2.Request(url)
                    request = self.addAgent(request)
                    response = urllib2.urlopen(request, timeout=1)
                    time.sleep(self.stime)
                    htmldata = response.read()
                    htmldata = eval(htmldata)   # the response is a JSON-style dict; json.loads would be safer than eval
                    self.data.append((htmldata["Mobile"], htmldata["Province"], htmldata["City"], htmldata["Corp"]))
                    with open("./enddata.csv", 'a+') as f:
                        sp = csv.writer(f, delimiter=",", quoting=csv.QUOTE_MINIMAL)
                        sp.writerow([htmldata["Mobile"], htmldata["Province"], htmldata["City"], htmldata["Corp"]])
                    print "countOk:", count
                    break
                except Exception as e:
                    time.sleep(self.stime)
                    print e
                    count -= 1
                    continue
        self.outPutData()

def creatPool(L):
    p = []
    for x in L:
        x = str(x[0]) + ":" + x[1]
        p.append(x)
    return p

def countData(L=None, path=None):
    all = []
    if path is not None and L is None:
        with open(path, 'rb') as f:
            sp = csv.reader(f, delimiter=",", quotechar='"')
            for x in sp:
                all.append(x)
        return all
    # data comes from a database only
    elif L is not None and path is None:
        pass
    # if both are empty, or both are given, return None
    else:
        return None

if __name__ == "__main__":
    """
    # build the proxy pool
    proxylist = getUsefuleProxy()
    #print proxylist
    #sys.exit()
    pool = creatPool(proxylist)
    print pool
    #sys.exit()
    phonenumberfile = "C:\\Users\\x\\Desktop\\all.csv"
    phonelist = getPhoneList(filepath=phonenumberfile)
    ht = AnalyseHtml(phonelist, pool, filepath="./outputdata.csv")
    ht.postToGetData()
    """
    t = countData(path="./enddata.csv")
    p = []    # provinces
    y = []    # carriers
    c = []    # cities
    pp = []
    yy = []
    cc = []
    for x in t:
        p.append(x[1])
        c.append(x[2])
        y.append(x[3])
    pset = list(set(p))
    cset = list(set(c))
    yset = list(set(y))
    #print len(pset), len(cset), len(yset)
    for x in pset:
        pp.append([x, p.count(x)])
    for x in cset:   # fixed: the original looped "for y in cset" and appended [x, ...], clobbering the carrier list y
        cc.append([x, c.count(x)])
    for z in yset:
        yy.append([z, y.count(z)])
    #print "here"
    #print pp, cc, yy
    value = []
    attr = []
    for x in pp:
        value.append(x[1])
        attr.append(x[0])
    print attr
    map = Map("Map 结合 VisualMap 示例", width=1200, height=600)
    map.add("", attr, value, maptype='china', is_visualmap=True, visual_text_color='#000')
    map.render()
```
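One design note on the handler rotation: `getHandler()` builds a brand-new generator on every call and advances it once, which only works because the pool is shared instance state. A standalone sketch of the same round-robin idea with the generator created a single time; the proxy address is the same placeholder used in section 4:

```python
#encoding:utf-8
# Round-robin proxy rotation sketch: one generator, created once.
import copy
import urllib2

copypool = [{"http": "117.65.42.1:51552"}, {}]   # {} = direct connection, no proxy

def handler_cycle():
    work = copy.deepcopy(copypool)
    while 1:
        if not work:
            work = copy.deepcopy(copypool)   # refill once the pool is exhausted
        yield urllib2.ProxyHandler(work.pop())

handlers = handler_cycle()        # build the generator once...
for _ in range(5):
    handler = next(handlers)      # ...and resume the same one on each request
    opener = urllib2.build_opener(handler)
```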
6.) POST requests
```
$ cat fanyi.py
```
```python
#encoding:utf8
import sys
import urllib2
from lxml import etree
import json
import urllib

ua_header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}

# Youdao version, for looking up single words. This one is the simplest.
def getWordText(word):
    url = "http://www.youdao.com/w/eng/" + word + "/#keyfrom=dict2.index"
    # print "bad input:\ne.g.\npython youdao 1 <word without spaces>\npython youdao 2 <English sentence>"
    try:
        request = urllib2.Request(url, headers=ua_header)
        response = urllib2.urlopen(request)
        htmldata = response.read()
        #print "test:", htmldata
        htmldata = etree.HTML(htmldata)
        expression_xpath = "/html/body/div[1]/div[2]/div[1]/div[2]/div[2]/div[1]/div[2]/ul/li"
        zh_text = htmldata.xpath(expression_xpath)
        #print len(zh_text)
        #print zh_text
        for x in zh_text:
            print x.text
    except Exception as e:
        print e

# Baidu Translate version: translates whole sentences. It handles single
# words too, but the word lookup above gives a more detailed explanation.
def getSentenceText(s):
    """
    :s: the English sentence to translate
    """
    url = "http://fanyi.baidu.com/v2transapi/"
    datadict = {"from": "en",
                "to": "zh",
                "query": None,
                "simple_means_flag": '3',
                }
    datadict["query"] = s
    formdata = urllib.urlencode(datadict)
    try:
        # passing data= turns the request into a POST
        request = urllib2.Request(url, data=formdata, headers=ua_header)
        response = urllib2.urlopen(request)
        htmldata = response.read()
        jsontext = json.loads(htmldata)
        result = jsontext["trans_result"]["data"][0]["dst"]
        print result
    except Exception as e:
        print "error:", e

if __name__ == "__main__":
    #getWordText(word="hello")   # fixed: the original comment called a nonexistent getHtmlText()
    getSentenceText("hello,world")
```
Output:
```
$ python fanyi.py
你好,世界
```
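For reference, a minimal sketch of the same POST in Python 3: `urlencode` moved to `urllib.parse`, and the request body must be bytes (in both versions, passing `data=` is what makes the request a POST):

```python
# Python 3 sketch of the same POST to Baidu Translate.
import json
from urllib.parse import urlencode
from urllib.request import Request, urlopen

url = "http://fanyi.baidu.com/v2transapi/"
formdata = urlencode({"from": "en", "to": "zh",
                      "query": "hello,world",
                      "simple_means_flag": "3"}).encode("utf-8")   # body must be bytes
request = Request(url, data=formdata)   # data= makes this a POST
response = urlopen(request)
result = json.loads(response.read().decode("utf-8"))
print(result["trans_result"]["data"][0]["dst"])
```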