爬人人好友
来源:互联网 发布:数据恢复精灵破解 编辑:程序博客网 时间:2024/04/30 11:56
昨天脑子一抽,想要爬人人网的好友……结果发现只能爬 2 层:我的好友,以及好友的好友。本来还想搞一下"最近访问",但是模板太多了,不同好友的 html 可能不一样,而且抓到的 id 有很多重复,之后再想办法解决。不过马上要期末考试了,所以先搁置一段时间吧!
from BeautifulSoup import BeautifulSoup as bp
import urllib
import urllib2
import cookielib
import re


def login(username, password):
    """Log in to renren.com and return the uid found on the home page.

    Posts the credentials to the ajaxLogin endpoint using a cookie-aware
    opener, installs that opener globally so that later
    ``urllib2.urlopen`` calls share the session cookies, then fetches
    the home page and extracts the ``'ruid'`` value from it.

    Raises:
        RuntimeError: if no ``'ruid'`` value is present in the home page.
    """
    logpage = "http://www.renren.com/ajaxLogin/login"
    login_data = urllib.urlencode({'email': username, 'password': password})
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    # Installed globally: the crawl below calls urllib2.urlopen directly.
    urllib2.install_opener(opener)
    opener.open(logpage, login_data).read()

    print("Getting user id of you now")
    html = urllib2.urlopen("http://www.renren.com/home").read()
    found = re.search(r"'ruid':'(\d+)'", html)
    if found is None:
        raise RuntimeError("no 'ruid' found in the home page; login probably failed")
    uid = found.group(1)
    print(uid)
    print("Login and got uid successfully")
    return uid


# NOTE(review): ``username`` and ``password`` are not defined anywhere in
# the visible script; they must be supplied before this call.
uid = login(username, password)

# Read the last page index from the pager on the first friend-list page.
url = 'http://friend.renren.com/GetFriendList.do?curpage=0&id=%s' % uid
html = bp(urllib2.urlopen(url).read())
href = html.findAll('div', {'class': 'page'})
try:
    href = str(href[1].findChildren()[-1]['href'])
    page = re.search(r"\d+", href).group(0)
except (IndexError, KeyError, AttributeError):
    # NOTE(review): assumes a missing pager means the list fits on a
    # single page -- confirm against the real markup.
    page = '0'

# Opened only after login succeeded so a failed run does not truncate rr.txt.
fp = open('rr.txt', 'w')
try:
    for i in range(int(page) + 1):
        url = 'http://friend.renren.com/GetFriendList.do?curpage=%s&id=%s' % (i, uid)
        html = bp(urllib2.urlopen(url).read())
        for word in html.findAll('dd'):
            if 'href' in str(word):
                # <dd> holding the profile link: friend name and id.
                name = word.a.string
                # Presumably the href starts with a 36-character URL prefix
                # followed by a 9-digit id -- TODO confirm.
                userid = word.a['href'][36:45]
                print('%s %s' % (name, userid))
                fp.write(name.encode('utf-8') + '\t' + userid.encode('utf-8') + '\t')
            else:
                # Any other <dd> is written as the last field of the record.
                adress = word.string
                if adress is None:
                    print('this one have no adress')
                    fp.write('\n')
                else:
                    print(adress)
                    fp.write(adress.encode('utf-8') + '\n')
        print('%s is ok.....' % i)
finally:
    fp.close()
from BeautifulSoup import BeautifulSoup as bp
import urllib
import urllib2
import cookielib
import re
def login(username, password):
    """Log in to renren.com and return the uid found on the home page.

    Posts the credentials to the ajaxLogin endpoint using a cookie-aware
    opener, installs that opener globally so that later
    ``urllib2.urlopen`` calls share the session cookies, then fetches
    the home page and extracts the ``'ruid'`` value from it.

    Args:
        username: value sent as the ``email`` form field.
        password: value sent as the ``password`` form field.

    Returns:
        The uid as a string of digits.

    Raises:
        RuntimeError: if no ``'ruid'`` value is present in the home page.
    """
    logpage = "http://www.renren.com/ajaxLogin/login"
    login_data = urllib.urlencode({'email': username, 'password': password})
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    # Installed globally: the rest of the script calls urllib2.urlopen directly.
    urllib2.install_opener(opener)
    # The response body is not used; it is read only to complete the request.
    opener.open(logpage, login_data).read()

    print("Getting user id of you now")
    html = urllib2.urlopen("http://www.renren.com/home").read()
    found = re.search(r"'ruid':'(\d+)'", html)
    if found is None:
        # Without this check the failure surfaces as an AttributeError on None.
        raise RuntimeError("no 'ruid' found in the home page; login probably failed")
    uid = found.group(1)
    print(uid)
    print("Login and got uid successfully")
    return uid
# NOTE(review): ``username`` and ``password`` are not defined anywhere in
# the visible script; they must be supplied before this call.
print(login(username, password))

FRIEND_LIST_URL = 'http://friend.renren.com/GetFriendList.do?curpage=%s&id=%s'


def _friend_id(field):
    """Return the user id held in the second column of an rr.txt line.

    Accepts a bare numeric id or a profile href containing ``id=<digits>``.
    Anything else falls back to the original fixed slice ``[36:45]``.
    """
    field = field.strip()
    if field.isdigit():
        # Already a bare id; slicing it at [36:45] would yield ''.
        return field
    found = re.search(r'id=(\d+)', field)
    if found is not None:
        return found.group(1)
    return field[36:45]


def _fetch_friend_page(userid, pageno):
    """Download one friend-list page and return it parsed by BeautifulSoup."""
    return bp(urllib2.urlopen(FRIEND_LIST_URL % (pageno, userid)).read())


fp = open('rr.txt', 'r')
dic = open('ftf.txt', 'w+')
try:
    for line in fp:
        fields = line.split('\t')
        if len(fields) < 2:
            # Blank or short line: no name/id pair to work with.
            continue
        Mname = fields[0]
        Muserid = _friend_id(fields[1])
        try:
            # The pager on the first page gives the index of the last page.
            html = _fetch_friend_page(Muserid, 0)
            href = html.findAll('div', {'class': 'page'})
            href = str(href[1].findChildren()[-1]['href'])
            page = re.search(r"\d+", href).group(0)
            for i in range(int(page) + 1):
                html = _fetch_friend_page(Muserid, i)
                for word in html.findAll('dd'):
                    if 'href' in str(word):
                        # <dd> holding the profile link: name and full href.
                        name = word.a.string
                        userid = word.a['href']
                        dic.write(Mname + '\t' + name.encode('utf-8') + '\t'
                                  + userid.encode('utf-8') + '\t')
                    else:
                        # Any other <dd> closes the record; empty when it
                        # has no single text child.
                        adress = word.string
                        if adress is None:
                            dic.write('\n')
                        else:
                            dic.write(adress.encode('utf-8') + '\n')
                print('%s is ok ...' % i)
        except (IOError, IndexError, KeyError, AttributeError,
                TypeError, ValueError) as err:
            # Best effort per friend: report why this one was skipped and
            # carry on with the next line instead of hiding the cause.
            print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>worry')
            print('skipped id %r: %r' % (Muserid, err))
        print('%s is ok >>>>>>>>>>>>>' % Mname)
finally:
    fp.close()
    dic.close()
问题:
1、能不能不要每次都登入
2、最近好友有100人访问限制,需要验证码,求破。看样子是要学习一下urllib包了,里面应该有模拟浏览器的办法。
登入的代码是借鉴其他人的 ,自己现在还不会,还是菜鸟~~~~~~~~~~~~~
最后我发了好几次 为什么插入代码的方式会出现html标签,,求解!
- 爬人人好友
- 人人好友生日提取
- python 爬虫爬取人人网你的好友的所有相册图片
- Python小练习:可视化人人好友关系
- 查看人人网非好友的状态
- 查看人人网非好友的状态
- PHP写的人人网好友爬虫
- 人人网查看非好友状态
- 获取人人网上好友通讯录同步到手机上
- python自动下载人人所有好友的相册
- 类似人人网 "新鲜事"(好友动态、SNS)架构:
- 用python写的人人网遍历好友脚本
- Python_自动登陆人人网 输出好友列表
- 人人网,获取好友Json代码的正则表达式
- 社会网络分析:探索人人网好友推荐系统
- 类似人人网 "新鲜事"(好友动态、SNS)架构
- httpclient登陆人人网,发表状态、日志,遍历访问所有好友、给好友留言
- 爬取人人贷
- poj 1003 && hdu 1056 HangOver
- 机器人挖矿问题
- PHP中spl_autoload_register函数的用法
- POJ 4046 Sightseeing 解题报告
- dwr session error 问题解决
- 爬人人好友
- 揭示同步块索引(上):从lock开始
- win7 上 linux mint14 (64bit) 双系统安装过程
- HDU 4291 A Short problem 矩阵,多重函数求循环节
- Arduino Leonardo新手入门体验
- 写给自己的话 二
- 将g++编译器集成到VC2005中
- PHP设计模式之:单例模式
- jQuery去掉指定标签里所有文字内容对应的链接,==去掉<a>标签