爬人人好友

来源:互联网 发布:数据恢复精灵破解 编辑:程序博客网 时间:2024/04/30 11:56
昨天脑子抽到想要爬人人好友,,,,,,发现只能爬2层 我的好友 和好友的好友。  本来还想搞一下最近访问的,但是模板太多了,不同好友的html可能不一样,而且抓的id有很多重复,再想办法解决。但是要期末考试了,所以先搁置一段时间吧!
from BeautifulSoup import BeautifulSoup as bp import urllib import urllib2 import cookielib import re fp=open('rr.txt','w')   def login(username, password):     """log in and return uid"""     logpage = "http://www.renren.com/ajaxLogin/login"     data = {'email': username, 'password': password}     login_data = urllib.urlencode(data)     cj = cookielib.CookieJar()     opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))     urllib2.install_opener(opener)     res = opener.open(logpage, login_data)     #print "Login now ..."     html = res.read()     #print html      # Get uid     print "Getting user id of you now"     res = urllib2.urlopen("http://www.renren.com/home")     html = res.read()     #print html     uid = re.search("'ruid':'(\d+)'", html).group(1)     print uid     print "Login and got uid successfully"     return uid 
login(username, password)
    url='http://friend.renren.com/GetFriendList.do?curpage=0&id=%s'%Muserid     html=urllib2.urlopen(url).read()     html=bp(html)     href=html.findAll('div',{'class':'page'})     try:         href=href[1].findChildren()[-1]['href']         href=str(href)         page=re.search(r"\d+",href).group(0)         #print page 

for i in range(int(page)+1):
    url='http://friend.renren.com/GetFriendList.do?curpage=%s&id=%s'%(i,uid)
    html=urllib2.urlopen(url).read()
    html=bp(html)
    #print html
    words=html.findAll('dd')
    for word in words:
        #print type(word),type(str(word)),'href' in word,'href' in str(word)
        if 'href' in str(word):
            name=word.a.string
            userid= word.a['href'][36:45]
            print name,userid ;fp.write(name.encode('utf-8')+'\t'+userid.encode('utf-8')+'\t')
        else:
            try:
                adress = word.string
                print adress;fp.write(adress.encode('utf-8')+'\n')
            except:
                print 'this one have no adress'
                fp.write('\n')
    print i,'is ok.....'
fp.close()

 
from BeautifulSoup import BeautifulSoup as bp
import urllib
import urllib2
import cookielib
import re
 
def login(username, password):
    """log in and return uid"""
    logpage = "http://www.renren.com/ajaxLogin/login"
    data = {'email': username, 'password': password}
    login_data = urllib.urlencode(data)
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    res = opener.open(logpage, login_data)
    #print "Login now ..."
    html = res.read()
    #print html
 
    # Get uid
    print "Getting user id of you now"
    res = urllib2.urlopen("http://www.renren.com/home")
    html = res.read()
    #print html
    uid = re.search("'ruid':'(\d+)'", html).group(1)
    print uid
    print "Login and got uid successfully"
    return uid
 
 
print login(username, password) 
fp=open('rr.txt','r')
dic=open('ftf.txt','w+')
for line in fp.readlines():
     
    Mname=line.split('\t')[0]
    Muserid=line.split('\t')[1][36:45]
    #print len(id)
    #print userid
    url='http://friend.renren.com/GetFriendList.do?curpage=0&id=%s'%Muserid
    html=urllib2.urlopen(url).read()
    html=bp(html)
    href=html.findAll('div',{'class':'page'})
    try:
        href=href[1].findChildren()[-1]['href']
        href=str(href)
        page=re.search(r"\d+",href).group(0)
        #print page
         
        for i in range(int(page)+1):
            urls=r'http://friend.renren.com/GetFriendList.do?curpage='+str(i)+r'&id='+str(Muserid)
            #print urls
            html=urllib2.urlopen(urls).read()
            html=bp(html)
            words=html.findAll('dd')
            #print len(words)
            for word in words:
                if 'href' in str(word):
                    name=word.a.string
                    userid= word.a['href']
                    #print Mname, name,userid ;
                    dic.write(Mname+'\t'+name.encode('utf-8')+'\t'+userid.encode('utf-8')+'\t')
                else:
                    try:
                        adress = word.string
                        #print adress;
                        dic.write(adress.encode('utf-8')+'\n')
                    except:
                        #print 'this one have no adress'
                        dic.write('\n')
    except:
        print '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>worry'
        print i ,'is ok ...'
    print Mname ,'is ok >>>>>>>>>>>>>'
fp.close()
dic.close()



问题:

1、能不能不要每次都登入

2、最近好友有100人访问限制,需要验证码,求破。看样子是要学习一下urllib包了 里面应该有模拟浏览器的办法



登入的代码是借鉴其他人的 ,自己现在还不会,还是菜鸟~~~~~~~~~~~~~





最后我发了好几次 为什么插入代码的方式会出现html标签,,求解!