Python Crawler
I spent the past two days learning Python web scraping. My first goal was to simulate logging in to a website (I picked this blog), and now it finally works ^_^. Noting it down to mark the occasion~
Steps:
To log in, first find the URL the form data is submitted to. Besides the username and password there are some hidden parameters, which you need the browser's F12 developer tools to see. Log in manually in the browser first and inspect the form data; usually the first request in the Network tab shows all the parameters being submitted. Sites typically include a parameter whose value is a random token, and that one has to be extracted with a regular expression. Once I had it I thought I was done, but I still couldn't log in. I printed the token I had extracted together with the page, and the token in the page was different from the one I had fetched. Obvious in hindsight: I was making two requests, so of course they differed; the first request fetched the token, the second submitted the login data. I couldn't see a good way around it, so I figured one of my steps was wrong and asked my classmate Fangzhang, who told me the second request should submit the data to the post-login URL ==. That made sense; got it. Headers are needed too: at first I didn't send any and the server refused my requests. The headers have to identify the client making the request. Some sites also restrict the referring page as an anti-hotlinking measure; for those, just add a Referer header carrying the source link.
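Worth spelling out the fix: the token is tied to the session that fetched it, so the token fetch and the login POST have to share cookies. Below is a minimal sketch of that idea using requests.Session, with the field names (lt, execution, _eventId) taken from the script further down; this shows only the concept, since as described below I never got this flow to work against CSDN and ended up copying the browser's cookies instead.

import re
import requests

login_url = 'https://passport.csdn.net/account/login?ref=toolbar'
session = requests.Session()  # one session, so cookies carry over between requests
page = session.get(login_url, headers={'User-Agent': 'Mozilla/5.0'}).text
# pull the hidden token fields out of the login form
lt = re.findall('(?<=name="lt" value=")[^"]+(?=")', page)[0]
execution = re.findall('(?<=name="execution" value=")[^"]+(?=")', page)[0]
resp = session.post(login_url,
                    data={'username': 'user', 'password': 'pass', 'lt': lt,
                          'execution': execution, '_eventId': 'submit'},
                    headers={'User-Agent': 'Mozilla/5.0', 'Referer': login_url})
print(resp.status_code)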
lt = re.findall('(?<=name="lt" value=")[^"]+(?=")', tmpfile)
Note the [^"] in this regex: it matches any character other than ". If the value itself contained a " character, the match would break off early and come out wrong.
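As a minimal sketch of how the extraction works (the HTML fragment is invented for illustration):

import re

html = '<input type="hidden" name="lt" value="LT-12345-abcde" />'
lt = re.findall('(?<=name="lt" value=")[^"]+(?=")', html)
print(lt)  # ['LT-12345-abcde'] -- findall returns a list, so use lt[0]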
# -*- coding: utf-8 -*-
import urllib2
import urllib
import re
import cookielib

url = "https://passport.csdn.net/account/login?ref=toolbar"
tmpres = urllib2.urlopen(url)
tmpfile = tmpres.read()
#print tmpfile
#pattern = re.compile('(?<=name="lt" value=")[^"]+(?=")')
lt = re.findall('(?<=name="lt" value=")[^"]+(?=")', tmpfile)
execution = re.findall('(?<=name="execution" value=")[^"]+(?=")', tmpfile)
values = {
    "username": "*****@qq.com",
    "password": "******",
    "lt": lt[0],
    "execution": execution[0],
    "_eventId": "submit"
}
#print lt[0], execution[0]
user_agent = 'Mozilla/5.0'
headers = {
    'User-Agent': user_agent,
    'Referer': 'https://passport.csdn.net/account/login?ref=toolbar'
}
data = urllib.urlencode(values)  # URL-encode the form fields
request = urllib2.Request("http://blog.csdn.net/jxust_tj", data, headers)
response = urllib2.urlopen(request)
print response.read()
#print len(re.read())
#print response.getcode()
I thought the code above had me logged in, but in fact the page didn't even show the "edit" or "delete" links, so the login hadn't actually succeeded ==
I tried for a long time without success and couldn't figure out why, so I fell back on the bluntest trick: log in with the browser, then copy the browser's cookie into the request headers ==
# -*- coding: utf-8 -*-
import urllib2
import urllib
import re
import requests
import cookielib

login_url = "https://passport.csdn.net/account/login?ref=toolbar"
main_url = "http://blog.csdn.net/jxust_tj"
user_agent = 'Mozilla/5.0'
headers = {
    'User-Agent': user_agent,
    'Cookie': 'uuid_tt_dd=10779..........',
    'Upgrade-Insecure-Requests': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'blog.csdn.net'
}
'''
s = requests.Session()
s.post(login_url, data=values, headers=headers)
r = s.get(main_url)
print r.text
'''
#filename = 'cookie.txt'
cookie = cookielib.CookieJar()
#cookie = cookielib.MozillaCookieJar()
#cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib2.HTTPCookieProcessor(cookie)
opener = urllib2.build_opener(handler)
#cookie.save(ignore_discard=True, ignore_expires=True)
request = urllib2.Request(main_url, headers=headers)
response = opener.open(request)
print response.read()
#print r2.read()
#print len(re.read())
#print response.getcode()
5.17
Today I wanted to scrape the code blocks off this blog, and then spent the entire evening writing one regular expression, debugging and debugging... I honestly wanted to cry.
-----------
Regular expression notes:
Note that findall returns a list. Also, literal parentheses () in a pattern must be escaped with \, otherwise they are treated as a capture group.
The question mark ? makes a quantifier non-greedy, i.e. shortest match: it stops as soon as a match succeeds, so that a later occurrence of the delimiter doesn't drag the match further than intended.
Example 1: given xxx,,,yyy, match the content between xxx and yyy: re.findall('xxx(.*?)yyy', text)
Example 2: if the xxx and yyy of Example 1 are each on their own line and the content in between spans multiple lines, the pattern above fails, because . matches anything except a newline. Use DOTALL mode so that . matches newlines too: re.findall('(?s)xxx(.*?)yyy', text). (A runnable sketch of both examples follows these notes.)
-----------
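A minimal demonstration of these notes (the sample strings are invented):

import re

print(re.findall('a\(b\)c', 'xa(b)cx'))      # ['a(b)c'] -- escaped parens match literally
text1 = 'xxx,,,yyy'
print(re.findall('xxx(.*?)yyy', text1))      # [',,,'] -- non-greedy match on one line
text2 = 'xxx\nline one\nline two\nyyy'
print(re.findall('xxx(.*?)yyy', text2))      # [] -- '.' does not cross newlines
print(re.findall('(?s)xxx(.*?)yyy', text2))  # ['\nline one\nline two\n'] -- DOTALL mode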
#-*- coding: utf-8 -*-
import requests
import re
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

url = "http://blog.csdn.net/jxust_tj/article/details/27522233"
user_agent = 'Mozilla/5.0'
headers = { 'User-Agent': user_agent }
# fetch the page with the requests library
r = requests.get(url, headers=headers)
tmpfile = r.text
# the regex uses <pre name="code" class="cpp"> as the prefix and </pre> as the
# suffix; the *? in the middle is non-greedy, so each match stops at the first
# closing tag
code = re.findall('(?<=<pre name="code" class="cpp">)[\w\W]*?(?=</pre>)', tmpfile)
# re.sub(pattern, repl, string) replaces every substring of string that matches
# pattern with repl and returns the result. The first arguments below are HTML
# entities (the blog's code formatter escapes these characters), decoded back
# into literal characters.
r1 = re.sub('&lt;', '<', code[0])
r2 = re.sub('&gt;', '>', r1)
r3 = re.sub('&#39;', "'", r2)
r4 = re.sub('&quot;', '"', r3)
r5 = re.sub('&amp;', '&', r4)
print r5
8.21 Scraping a site's JSON data
I used multiple processes to speed things up. I also learned about some other techniques for optimizing speed, such as DNS caching, asynchronous I/O, the epoll model, and distributed crawling.
At first my program ran very slowly, because some requests took ages to get a response. Requests like that are mostly broken and should be discarded, so I set a timeout on the responses, and things got much faster.
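That timeout pattern in isolation (the URL is a placeholder; the same idea appears in the script below with requests.post):

import requests

def fetch(url):
    try:
        # give up if the server has not responded within 1 second
        return requests.get(url, timeout=1).text
    except requests.exceptions.RequestException:
        return None  # treat slow or broken requests as misses and move on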
(The string returned by base64.encodestring actually ends with an extra newline; this bug took me several days to find.)
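A quick demonstration of that pitfall; in Python 2, base64.encodestring inserts newlines into its output:

import base64

s = base64.encodestring('100001000000')
print(repr(s))           # the encoded value ends with '\n'
s = s.replace('\n', '')  # strip it before splicing the value into a URL
print(repr(s))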
# -*- coding: utf-8 -*-
import urllib2
import urllib
import cookielib
import requests
import json
import os
import time
import base64
import sys
from multiprocessing import Process
reload(sys)
sys.setdefaultencoding("utf-8")

login_url = "http://"
main_url = "http://"
json_url = "http://"
user_agent = "Mozilla/5.0"
headers = { 'User-Agent': user_agent }
username = ""
password = ""
cookie = cookielib.CookieJar()
handler = urllib2.HTTPCookieProcessor(cookie)
opener = urllib2.build_opener(handler)
#s = requests.Session()

f1 = open('./fen11.txt','a')
f2 = open('./fen12.txt','a')
f3 = open('./fen13.txt','a')
f4 = open('./fen14.txt','a')
f5 = open('./fen15.txt','a')
f6 = open('./fen16.txt','a')
f7 = open('./fen17.txt','a')
f8 = open('./fen18.txt','a')
f9 = open('./fen19.txt','a')
f10 = open('./fen20.txt','a')
f = [f1,f2,f3,f4,f5,f6,f7,f8,f9,f10]

def login(username, password):
    values = {
        "username": username,
        "password": password,
        "rememberMe": "1"
    }
    data = urllib.urlencode(values)
    request = urllib2.Request(login_url, data, headers)
    response = opener.open(request)
    print response.getcode()
    print response.read()

def GetJson(ID):
    values = {
        'id': ID,
        '_random': '0.5934738078412436'
    }
    data = urllib.urlencode(values)
    search_url = 'http://' + base64.encodestring(ID)
    search_url = search_url.replace('\n', '')
    header = {
        'Host': '...',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': search_url,
        'Content-Length': '39',
        'Cookie': '...',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0'
    }
    proxies = {
        'http': 'http://211.167.112.14:80',
        'https': 'http://211.167.112.14:80'
    }
    # request = urllib2.Request(json_url, headers=header)
    # response = opener.open(request)
    # print response.getcode()
    #if response.getcode() != 200 and response.getcode() != 302:
    #    print "GetJson fail"
    #    print response.read()
    # r1 = urllib2.Request(json_url, data, header)
    # r2 = opener.open(r1, data=data)
    # print r2.read()
    try:
        r1 = requests.post(json_url, data=values, headers=header, timeout=1)
    except:
        return None
    # print "json_status_code:" + "%d" % r1.status_code
    return r1.text

def GetResume(ID, fid):
    jsonData = GetJson(ID)
    if jsonData == None:
        return False
    value = json.loads(jsonData)
    #print type(value['originalFilePath'].encode("utf-8"))
    if value.has_key('originalFilePath'):
        s = value['originalFilePath']
        if s != "nullnull":
            print >> f[fid-1], ID, s
            return True
    return False

def Loop(begin, end, fid):
    # start_time = time.clock()
    #login(username, password)
    id = 100001000000
    num = 1
    for i in range(id+begin, id+end):
        try:
            if GetResume('%d' % i, fid) == True:
                num = num + 1
                #time.sleep(0.01)
        except urllib2.HTTPError, e:
            print e.code
        except urllib2.URLError, e:
            print e.reason
    f[fid-1].close()
    # end_time = time.clock()
    # print "%f s" % (end_time - start_time)

if __name__ == '__main__':
    # Loop(2302734,2400000,4)
    p1 = Process(target=Loop, args=(2010067,2100000,1,))
    p2 = Process(target=Loop, args=(2101840,2200000,2,))
    p3 = Process(target=Loop, args=(2235917,2300000,3,))
    p4 = Process(target=Loop, args=(2302734,2400000,4,))
    p5 = Process(target=Loop, args=(2410282,2500000,5,))
    p6 = Process(target=Loop, args=(2532273,2600000,6,))
    p7 = Process(target=Loop, args=(2626960,2700000,7,))
    p8 = Process(target=Loop, args=(2710324,2800000,8,))
    p9 = Process(target=Loop, args=(2822153,2900000,9,))
    p10 = Process(target=Loop, args=(2910190,3000000,10,))
    p1.start()
    p2.start()
    p3.start()
    p4.start()
    p5.start()
    p6.start()
    p7.start()
    p8.start()
    p9.start()
    p10.start()