爬人人好友

来源:互联网 发布:数据恢复精灵破解 编辑:程序博客网 时间:2024/04/30 11:56
昨天脑子抽到想要爬人人好友,,,,,,发现只能爬2层 我的好友 和好友的好友。  本来还想搞一下最近访问的,但是模板太多了,不同好友的html可能不一样,而且抓的id有很多重复,再想办法解决。但是要期末考试了,所以先搁置一段时间吧!
from BeautifulSoup import BeautifulSoup as bp import urllib import urllib2 import cookielib import re fp=open('rr.txt','w')   def login(username, password):     """log in and return uid"""     logpage = "http://www.renren.com/ajaxLogin/login"     data = {'email': username, 'password': password}     login_data = urllib.urlencode(data)     cj = cookielib.CookieJar()     opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))     urllib2.install_opener(opener)     res = opener.open(logpage, login_data)     #print "Login now ..."     html = res.read()     #print html      # Get uid     print "Getting user id of you now"     res = urllib2.urlopen("http://www.renren.com/home")     html = res.read()     #print html     uid = re.search("'ruid':'(\d+)'", html).group(1)     print uid     print "Login and got uid successfully"     return uid 
login(username, password)
    url='http://friend.renren.com/GetFriendList.do?curpage=0&id=%s'%Muserid     html=urllib2.urlopen(url).read()     html=bp(html)     href=html.findAll('div',{'class':'page'})     try:         href=href[1].findChildren()[-1]['href']         href=str(href)         page=re.search(r"\d+",href).group(0)         #print page 

for i in range(int(page)+1):
    url='http://friend.renren.com/GetFriendList.do?curpage=%s&id=%s'%(i,uid)
    html=urllib2.urlopen(url).read()
    html=bp(html)
    #print html
    words=html.findAll('dd')
    for word in words:
        #print type(word),type(str(word)),'href' in word,'href' in str(word)
        if 'href' in str(word):
            name=word.a.string
            userid= word.a['href'][36:45]
            print name,userid ;fp.write(name.encode('utf-8')+'\t'+userid.encode('utf-8')+'\t')
        else:
            try:
                adress = word.string
                print adress;fp.write(adress.encode('utf-8')+'\n')
            except:
                print 'this one have no adress'
                fp.write('\n')
    print i,'is ok.....'
fp.close()

 
from BeautifulSoup import BeautifulSoup as bp
import urllib
import urllib2
import cookielib
import re
 
def login(username, password):
    """log in and return uid"""
    logpage = "http://www.renren.com/ajaxLogin/login"
    data = {'email': username, 'password': password}
    login_data = urllib.urlencode(data)
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    res = opener.open(logpage, login_data)
    #print "Login now ..."
    html = res.read()
    #print html
 
    # Get uid
    print "Getting user id of you now"
    res = urllib2.urlopen("http://www.renren.com/home")
    html = res.read()
    #print html
    uid = re.search("'ruid':'(\d+)'", html).group(1)
    print uid
    print "Login and got uid successfully"
    return uid
 
 
print login(username, password) 
fp=open('rr.txt','r')
dic=open('ftf.txt','w+')
for line in fp.readlines():
     
    Mname=line.split('\t')[0]
    Muserid=line.split('\t')[1][36:45]
    #print len(id)
    #print userid
    url='http://friend.renren.com/GetFriendList.do?curpage=0&id=%s'%Muserid
    html=urllib2.urlopen(url).read()
    html=bp(html)
    href=html.findAll('div',{'class':'page'})
    try:
        href=href[1].findChildren()[-1]['href']
        href=str(href)
        page=re.search(r"\d+",href).group(0)
        #print page
         
        for i in range(int(page)+1):
            urls=r'http://friend.renren.com/GetFriendList.do?curpage='+str(i)+r'&id='+str(Muserid)
            #print urls
            html=urllib2.urlopen(urls).read()
            html=bp(html)
            words=html.findAll('dd')
            #print len(words)
            for word in words:
                if 'href' in str(word):
                    name=word.a.string
                    userid= word.a['href']
                    #print Mname, name,userid ;
                    dic.write(Mname+'\t'+name.encode('utf-8')+'\t'+userid.encode('utf-8')+'\t')
                else:
                    try:
                        adress = word.string
                        #print adress;
                        dic.write(adress.encode('utf-8')+'\n')
                    except:
                        #print 'this one have no adress'
                        dic.write('\n')
    except:
        print '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>worry'
        print i ,'is ok ...'
    print Mname ,'is ok >>>>>>>>>>>>>'
fp.close()
dic.close()



问题:

1、能不能不要每次都登入

2、最近好友有100人访问限制,需要验证码,求破。看样子是要学习一下urllib包了 里面应该有模拟浏览器的办法



登入的代码是借鉴其他人的 ,自己现在还不会,还是菜鸟~~~~~~~~~~~~~





最后我发了好几次 为什么插入代码的方式会出现html标签,,求解!