A first try at Python web crawlers

1.) The simplest crawler:

#encoding:utf-8
import urllib2

ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
request = urllib2.Request("http://www.baidu.com", headers=ua_headers)
response = urllib2.urlopen(request)
htmldata = response.read()
print htmldata
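For readers on Python 3, where urllib2's functionality lives in urllib.request, a minimal sketch of the same request (the decode step assumes the page is served as UTF-8); the rest of this post sticks to Python 2:

#encoding:utf-8
# Python 3 sketch of the same request; urllib2 became urllib.request
from urllib import request as urlrequest

ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
req = urlrequest.Request("http://www.baidu.com", headers=ua_headers)
response = urlrequest.urlopen(req)
htmldata = response.read().decode("utf-8")  # read() returns bytes in Python 3, assumed UTF-8 here
print(htmldata)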

2.) Simulating a browser:

#encoding:utf-8
import urllib2
import urllib
import random

ua_list = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
           "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0",
           "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.0.0 Safari/537.36",
           "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"]
user_agent = random.choice(ua_list)
request = urllib2.Request("http://www.baidu.com")
request.add_header("User-Agent", user_agent)
response = urllib2.urlopen(request)
htmldata = response.read()
#print htmldata
# the HTTP status code returned by the server
print response.getcode()
# the URL of the data actually returned (after any redirect)
print response.geturl()
# the server's response headers
#print response.info()
# the User-Agent header this request was sent with
print request.get_header("User-agent")
# urllib's urlencode() takes a dict and turns it into a percent-encoded query string
wd = {"wd": "百度"}
m = urllib.urlencode(wd)
# and back again:
print urllib.unquote(m)
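The encoded string is what would normally be appended to a GET URL. A small sketch, assuming Baidu's /s?wd= search path as the target (that path is an illustration, not part of the original script):

#encoding:utf-8
# sketch: feeding the urlencoded dict into a GET request (Python 2)
import urllib
import urllib2

wd = {"wd": "百度"}
fullurl = "http://www.baidu.com/s?" + urllib.urlencode(wd)  # becomes .../s?wd=%E7%99%BE%E5%BA%A6
request = urllib2.Request(fullurl, headers={"User-Agent": "Mozilla/5.0"})
response = urllib2.urlopen(request)
print response.getcode()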

3.) Scraping with XPath

References on using XPath:
http://cuiqingcai.com/2621.html
http://blog.csdn.net/MrLevo520/article/details/53158050
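As a warm-up before the real script, here is a small self-contained sketch of the lxml XPath calls used below; the HTML snippet is made up for illustration:

#encoding:utf-8
# minimal lxml XPath demo on an inline HTML snippet (illustrative only)
from lxml import etree

html = """
<div class="li_txt">
  <h3>Teacher Name</h3>
  <h4>Senior Lecturer</h4>
</div>
"""
tree = etree.HTML(html)
nodes = tree.xpath("//div[@class='li_txt']")   # every div with that class
print nodes[0].xpath("./h3/text()")[0]         # text() selects the text nodes directly
print nodes[0].xpath("./h4")[0].text           # or select the element and read its .text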

[root@VM_131_54_centos pachong]# vi itcast.py 
[root@VM_131_54_centos pachong]# cat itcast.py 
import urllib
import urllib2
from lxml import etree

url = "http://www.itcast.cn/channel/teacher.shtml"
ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
request = urllib2.Request(url, headers=ua_headers)
response = urllib2.urlopen(request)
htmldata = response.read()
htmldata = etree.HTML(htmldata)
#print htmldata
content = htmldata.xpath("//div[@class='li_txt']//*")
print type(content)
print len(content)
print content[2].text
[root@VM_131_54_centos pachong]# python itcast.py 
<type 'list'>
582
15年以上的软件开发、大型软件项目设计和团队管理经验。精通C/C++、pascal、Basic等各种编程语言,精通MySQL、Oracle、SQLServer等关系数据库。讲课深入浅出,将复杂理论转化为通俗易懂的语言,深受学生的好评。

Extracting the text of each matched node:

[root@VM_131_54_centos pachong]# cat paitcast.py 
#encoding:utf-8
import urllib
import urllib2
from lxml import etree
import sys

url = "http://www.itcast.cn/channel/teacher.shtml"
ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
request = urllib2.Request(url, headers=ua_headers)
response = urllib2.urlopen(request)
htmldata = response.read()
htmldata = etree.HTML(htmldata)
#print htmldata
allcontent = htmldata.xpath("//div[@class='li_txt']")
for x in allcontent:
    print x.xpath('./h3/text()')[0]
    print x.xpath('./h4')[0].text
    sys.exit()
[root@VM_131_54_centos pachong]# python paitcast.py 
朱老师
高级讲师

4.) Using an HTTP proxy

#encoding:utf-8
import urllib2
from lxml import etree

def testproxy():
    ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
    request = urllib2.Request("http://www.baidu.com")
    request.add_header("User-Agent", ua_headers["User-Agent"])  # pass the UA string, not the whole dict
    handler = urllib2.ProxyHandler({"http": "117.65.42.1:51552"})
    opener = urllib2.build_opener(handler)
    urllib2.install_opener(opener)
    #r = opener.open('http://www.baidu.com')
    #print(r.read())
    try:
        response = urllib2.urlopen(request, timeout=1)
        htmldata = response.read()
        htmldata = etree.HTML(htmldata)
        content = htmldata.xpath("//title")
        print content[0].text
    except Exception as e:
        print e

if __name__ == "__main__":
    testproxy()
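install_opener() makes the proxy global for every later urlopen() call. If the proxy should only apply to a single request, calling the opener directly (as the commented-out lines hint) avoids that global state; a sketch using the same example proxy address, which is likely long dead:

#encoding:utf-8
# sketch: route one request through a proxy without installing it globally
import urllib2

handler = urllib2.ProxyHandler({"http": "117.65.42.1:51552"})  # example address from above
opener = urllib2.build_opener(handler)
try:
    response = opener.open("http://www.baidu.com", timeout=1)  # only this call uses the proxy
    print response.getcode()
except Exception as e:
    print e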

5.) Scraping free proxies from a website

#encoding:utf-8
import urllib2
import sys
from lxml import etree
import time
import csv

def getHttpProxyFromXicidali(page=1):
    """
    :param page: number of pages to crawl (int)
    :return: a list of HTTP proxies
    """
    proxylist = []
    ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
    url = "http://www.xicidaili.com/wt/"
    #url = "http://www.baidu.com"
    for x in range(1, page + 1):
        pageurl = url + str(x)
        print pageurl
        request = urllib2.Request(pageurl)
        request.add_header("User-Agent", ua_headers["User-Agent"])
        response = urllib2.urlopen(request)
        htmldata = response.read()
        htmldata = etree.HTML(htmldata)
        content = htmldata.xpath("//tr[@class='odd']")
        #sys.exit()
        for x in range(len(content)):
            #print x
            # columns: ip, port, anonymity, type
            proxylist.append([content[x][1].text, content[x][2].text, content[x][4].text, content[x][5].text])
            #print proxylist[0][2] == u""
            #sys.exit()
    return proxylist

# read the usable proxies back from a file
def readProxyToFile(path):
    L = []
    with open(path, "rb") as f:
        sp = csv.reader(f, delimiter=',')
        for x in sp:
            L.append(x)
    return L[1:]

def writeProxyToFile(proxylist, path="./proxylist.csv"):
    """
    :param proxylist: the list returned by getHttpProxyFromXicidali()
    :param path: output file
    :return:
    """
    with open(path, 'w') as f:
        # ip address, port, anonymous or not, type (http/https)
        fieldnames = ['ip', 'port', 'ishide', 'type']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for x in proxylist:
            if x[2] == u"高匿":
                writer.writerow({'ip': x[0], 'port': x[1], 'ishide': "1", 'type': x[3]})
            elif x[2] == u"透明":
                writer.writerow({'ip': x[0], 'port': x[1], 'ishide': "0", 'type': x[3]})

# test whether each proxy works
def testIsUseful(L):
    """
    :param L: a list of free HTTP proxies
    :return: a list of the proxies that actually work
    """
    usefulproxy = []
    ua_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
    request = urllib2.Request("http://www.baidu.com")
    request.add_header("User-Agent", ua_headers["User-Agent"])
    for x in L:
        httpproxy = str(x[0]) + ':' + str(x[1])
        handler = urllib2.ProxyHandler({"http": httpproxy})
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
        try:
            response = urllib2.urlopen(request, timeout=1)
            htmldata = response.read()
            htmldata = etree.HTML(htmldata)
            content = htmldata.xpath("//title")
            if content[0].text == u"百度一下,你就知道":
                usefulproxy.append(x)
                print 'ok'
        except Exception as e:
            print e
            continue
    return usefulproxy

if __name__ == "__main__":
    # crawl HTTP proxies from the xicidaili site
    #proxylist = getHttpProxyFromXicidali(10)  # 100 proxies per page
    # write every crawled proxy (working, timing out, or dead) to a file
    #writeProxyToFile(proxylist)
    # test the crawled proxies and keep only the ones that respond in time
    #proxylist = testIsUseful(proxylist)
    #writeProxyToFile(proxylist, path="./usefulproxylist.csv")
    proxylist = readProxyToFile("./usefulproxylist.csv")

A small program for looking up phone numbers

#encoding:utf-8
import copy
import urllib2
import random
import time
import sys
from lxml import etree
import csv
from pyecharts import Map

class GetProxy(object):
    def getHttpProxyFromXicidali(self, page=1):
        """
        :param page: number of pages to crawl (int)
        :return: a list of HTTP proxies
        """
        proxylist = []
        ua_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
        url = "http://www.xicidaili.com/wt/"
        # url = "http://www.baidu.com"
        for x in range(1, page + 1):
            pageurl = url + str(x)
            print pageurl
            request = urllib2.Request(pageurl)
            request.add_header("User-Agent", ua_headers["User-Agent"])
            response = urllib2.urlopen(request)
            htmldata = response.read()
            htmldata = etree.HTML(htmldata)
            content = htmldata.xpath("//tr[@class='odd']")
            # sys.exit()
            for x in range(len(content)):
                # print x
                proxylist.append([content[x][1].text, content[x][2].text, content[x][4].text, content[x][5].text])
                # print proxylist[0][2] == u""
                # sys.exit()
        return proxylist

    # read the usable proxies back from a file
    def readProxyToFile(self, path):
        L = []
        with open(path, "rb") as f:
            sp = csv.reader(f, delimiter=',')
            for x in sp:
                L.append(x)
        return L[1:]

    def writeProxyToFile(self, proxylist, path="./proxylist.csv"):
        """
        :param proxylist: the list returned by getHttpProxyFromXicidali()
        :param path: output file
        :return:
        """
        with open(path, 'w') as f:
            # ip address, port, anonymous or not, type (http/https)
            fieldnames = ['ip', 'port', 'ishide', 'type']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for x in proxylist:
                if x[2] == u"高匿":
                    writer.writerow({'ip': x[0], 'port': x[1], 'ishide': "1", 'type': x[3]})
                elif x[2] == u"透明":
                    writer.writerow({'ip': x[0], 'port': x[1], 'ishide': "0", 'type': x[3]})

    # test whether each proxy works
    def testIsUseful(self, L):
        """
        :param L: a list of free HTTP proxies
        :return: a list of the proxies that actually work
        """
        usefulproxy = []
        ua_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}
        request = urllib2.Request("http://www.baidu.com")
        request.add_header("User-Agent", ua_headers["User-Agent"])
        for x in L:
            httpproxy = str(x[0]) + ':' + str(x[1])
            handler = urllib2.ProxyHandler({"http": httpproxy})
            opener = urllib2.build_opener(handler)
            urllib2.install_opener(opener)
            try:
                response = urllib2.urlopen(request, timeout=1)
                htmldata = response.read()
                htmldata = etree.HTML(htmldata)
                content = htmldata.xpath("//title")
                if content[0].text == u"百度一下,你就知道":
                    usefulproxy.append(x)
                    print 'ok'
            except Exception as e:
                print e
                continue
        return usefulproxy

def getUsefuleProxy():
    p = GetProxy()
    # crawl HTTP proxies from the xicidaili site
    #proxylist = p.getHttpProxyFromXicidali(1)  # 100 proxies per page
    # write every crawled proxy (working or not) to a file
    #p.writeProxyToFile(proxylist)
    # test the crawled proxies and keep only the ones that respond in time
    #proxylist = p.testIsUseful(proxylist)
    #p.writeProxyToFile(proxylist, path="./usefulproxylist.csv")
    proxylist = p.readProxyToFile("./usefulproxylist.csv")
    return proxylist

def getPhoneList(filepath=None, dbconn=None):
    """
    :param filepath: data file
    :param dbconn: database connection (host, ip, user, password, database)
    :return: a list of de-duplicated phone numbers
    """
    # data comes from a file only:
    if filepath is not None and dbconn is None:
        with open(filepath, 'rb') as f:
            all = f.readlines()
        return all
    # data comes from a database only
    elif dbconn is not None and filepath is None:
        pass
    # if both are empty, or both are given, return None
    else:
        return None

class AnalyseHtml():
    def __init__(self, L, pool, filepath=None, dbconn=None, proxyswitch=True):
        """
        :param L: the collection of phone numbers, a tuple or a list
        :param pool: a pool of free, password-less HTTP proxies
        :param filepath: write the results to a file (number, province, city, carrier)
        :param dbconn: (host, ip, user, password, database, tablename1, tablename2); write the results to tb1 and tb2
        """
        self.L = L
        self.pool = []
        for x in pool:
            x = {"http": x}
            self.pool.append(x)
        self.pool.append({})
        self.filepath = filepath
        self.dbconn = dbconn
        self.proxyswitch = proxyswitch
        self.data = []
        self.stime = 3.0 / len(self.pool)
        print self.pool, "__init__"
        # copypool keeps the initial copy of the pool, used to re-initialise it
        self.copypool = copy.deepcopy(self.pool)

    def outPutData(self):
        if self.filepath is not None and self.dbconn is None:
            with open(self.filepath, 'w') as f:
                fieldnames = ['number', 'Province', 'City', 'Corp']
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                for x in self.data:
                    writer.writerow({'number': x[0], 'Province': x[1], 'City': x[2], 'Corp': x[3]})
        elif self.dbconn is not None and self.filepath is None:
            pass

    # each call produces a different proxy handler
    def createHandler(self):
        while 1:
            if self.pool == []:
                self.pool = copy.deepcopy(self.copypool)
            x = self.pool.pop()
            handler = urllib2.ProxyHandler(x)
            yield handler

    # attach a different browser agent each time
    def addAgent(self, request):
        """
        :param request: a request object
        :return: the modified request
        """
        ua_list = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"]
        user_agent = random.choice(ua_list)
        request.add_header("User-Agent", user_agent)
        return request

    def getHandler(self):
        """
        :return: a proxy handler
        """
        return next(self.createHandler())

    def postToGetData(self):
        """
        :return: (number, province, city, carrier)
        """
        count = 0
        for x in self.L:
            #x = x[:12]
            x = str(int(x[:12]))  # why is this conversion needed? it fails without it
            url = "http://v.showji.com/Locating/showji.com2016234999234.aspx?m=" + x + "&output=json"
            print url
            while 1:
                try:
                    count += 1
                    handler = self.getHandler()
                    opener = urllib2.build_opener(handler)
                    urllib2.install_opener(opener)
                    request = urllib2.Request(url)
                    request = self.addAgent(request)
                    response = urllib2.urlopen(request, timeout=1)
                    time.sleep(self.stime)
                    htmldata = response.read()
                    htmldata = eval(htmldata)
                    self.data.append((htmldata["Mobile"], htmldata["Province"], htmldata["City"], htmldata["Corp"]))
                    with open("./enddata.csv", 'a+') as f:
                        sp = csv.writer(f, delimiter=",", quoting=csv.QUOTE_MINIMAL)
                        sp.writerow([htmldata["Mobile"], htmldata["Province"], htmldata["City"], htmldata["Corp"]])
                    print "countOk:", count
                    break
                except Exception as e:
                    time.sleep(self.stime)
                    print e
                    count -= 1
                    continue
        self.outPutData()

def creatPool(L):
    p = []
    for x in L:
        x = str(x[0]) + ":" + x[1]
        p.append(x)
    return p

def countData(L=None, path=None):
    all = []
    if path is not None and L is None:
        with open(path, 'rb') as f:
            sp = csv.reader(f, delimiter=",", quotechar='"')
            for x in sp:
                all.append(x)
        return all
    # data comes from a database only
    elif L is not None and path is None:
        pass
    # if both are empty, or both are given, return None
    else:
        return None

if __name__ == "__main__":
    """
    # the proxy pool
    proxylist = getUsefuleProxy()
    #print proxylist
    #sys.exit()
    pool = creatPool(proxylist)
    print pool
    #sys.exit()
    phonenumberfile = "C:\\Users\\x\\Desktop\\all.csv"
    phonelist = getPhoneList(filepath=phonenumberfile)
    ht = AnalyseHtml(phonelist, pool, filepath="./outputdata.csv")
    ht.postToGetData()
    """
    t = countData(path="./enddata.csv")
    p = []   # provinces
    y = []   # carriers
    c = []   # cities
    pp = []
    yy = []
    cc = []
    for x in t:
        p.append(x[1])
        c.append(x[2])
        y.append(x[3])
    pset = list(set(p))
    cset = list(set(c))
    yset = list(set(y))
    #print len(pset), len(cset), len(yset)
    for x in pset:
        pp.append([x, p.count(x)])
    for x in cset:
        cc.append([x, c.count(x)])
    for x in yset:
        yy.append([x, y.count(x)])
    #print "here"
    #print pp, cc, yy

    value = []
    attr = []
    for x in pp:
        value.append(x[1])
        attr.append(x[0])
    print attr
    map = Map("Map 结合 VisualMap 示例", width=1200, height=600)
    map.add("", attr, value, maptype='china', is_visualmap=True, visual_text_color='#000')
    map.render()
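One note on postToGetData(): it parses the JSON response with eval(), which executes whatever the server sends back. json.loads() does the same job safely; a sketch with a made-up sample response:

#encoding:utf-8
# sketch: parse the JSON lookup response with json.loads instead of eval
import json

htmldata = '{"Mobile": "13800000000", "Province": "北京", "City": "北京", "Corp": "中国移动"}'  # made-up sample
record = json.loads(htmldata)
print record["Mobile"], record["Province"], record["City"], record["Corp"]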

6.) POST requests

$ cat fanyi.py
+++++++++++++++++++ fanyi.py +++++++++++++++++++
#encoding:utf8
import sys
import urllib2
from lxml import etree
import json
import urllib

ua_header = {"User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"}

# Youdao lookup: for single words only; this is the simplest case.
def getWordText(word):
    url = "http://www.youdao.com/w/eng/" + word + "/#keyfrom=dict2.index"
    #   print "usage:\ne.g.\npython youdao 1 <single word, no spaces>\npython youdao 2 <English sentence>"
    try:
        request = urllib2.Request(url, headers=ua_header)
        response = urllib2.urlopen(request)
        htmldata = response.read()
        #print "test:", htmldata
        htmldata = etree.HTML(htmldata)
        expression_xpath = "/html/body/div[1]/div[2]/div[1]/div[2]/div[2]/div[1]/div[2]/ul/li"
        zh_text = htmldata.xpath(expression_xpath)
        #print len(zh_text)
        #print zh_text
        for x in zh_text:
            print x.text
    except Exception as e:
        print e

# Baidu Fanyi: translates whole sentences; single words also work, with more detailed output.
def getSentenceText(s):
    """
    :param s: the English sentence to translate
    """
    url = "http://fanyi.baidu.com/v2transapi/"
    datadict = {"from": "en",
                "to": "zh",
                "query": None,
                "simple_means_flag": '3',
                }
    datadict["query"] = s
    formdata = urllib.urlencode(datadict)
    try:
        request = urllib2.Request(url, data=formdata, headers=ua_header)
        response = urllib2.urlopen(request)
        htmldata = response.read()
        jsontext = json.loads(htmldata)
        result = jsontext["trans_result"]["data"][0]["dst"]
        print result
    except Exception as e:
        print "error:", e

if __name__ == "__main__":
    #getWordText(word="hello")
    getSentenceText("hello,world")
+++++++++++++++++++++++++++++ output ++++++++++++++++++++++++
$ python fanyi.py 
你好,世界
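The switch from GET to POST happens simply because a data= argument is passed to Request; the body has to be URL-encoded first, which is what urllib.urlencode() does above. A minimal sketch against the httpbin echo service (that URL is just a convenient public test endpoint, not part of the original script):

#encoding:utf-8
# sketch: passing data= is what turns a urllib2 request into a POST
import urllib
import urllib2
import json

formdata = urllib.urlencode({"from": "en", "to": "zh", "query": "hello"})
request = urllib2.Request("http://httpbin.org/post", data=formdata)  # httpbin echoes the request back
response = urllib2.urlopen(request)
echoed = json.loads(response.read())
print echoed["form"]  # the form fields the server received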
