Python 爬虫:爬取保研论坛

来源:互联网 发布:淘宝一键开店 编辑:程序博客网 时间:2024/05/24 05:01
# EE forum crawler
# by JerryFang
# 2013.11.13
#
# Logs in to www.eeban.com, loads one forum listing page (fid=137) and dumps
# every thread linked from that listing to stdout, pausing after each thread.
import re
from contextlib import closing

try:  # Python 2 module names (what this script was written against)
    import cookielib
    import urllib2
    from urllib import urlencode
    from urlparse import urljoin
except ImportError:  # Python 3: the same APIs under their new module names
    import http.cookiejar as cookielib
    import urllib.request as urllib2
    from urllib.parse import urlencode, urljoin

try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3

loginurl = 'http://www.eeban.com/member.php?mod=logging&action=login&loginsubmit=yes&infloat=yes&lssubmit=yes&inajax=1'
forumurl = 'http://www.eeban.com/forum.php?mod=forumdisplay&fid=137'

# Form fields copied from the site's login POST request.
# NOTE(review): credentials are hard-coded in source; the password looks like
# a 32-character hex digest rather than plaintext -- confirm, and move these
# out of the file before sharing it.
user_data = {
    'username': 'nku-ada',
    'password': '1986a7da84fc50b4c0140b1124b968d5',
    'quickforward': 'yes',
    'handlekey': 'ls',
}

# Compiled once at import instead of on every page visit.  The u'' prefix
# keeps these unicode patterns on Python 2 (pages are decoded before
# matching); it is a no-op on Python 3.
TABLE_RE = re.compile(u'''<table.*?>(.*?)</table>''', re.DOTALL | re.MULTILINE)
HEADER_CELL_RE = re.compile(u'''<th.*?>(.+?)</th>''', re.DOTALL | re.MULTILINE)
THREAD_LINK_RE = re.compile(
    u'''<em>.*?</em>.*?<a href="(.*?)".*?>(.*?)</a>''',
    re.DOTALL | re.MULTILINE,
)


def extract_tables(page_html):
    """Return the inner text of every ``<table>...</table>`` in *page_html*."""
    return TABLE_RE.findall(page_html)


def find_thread_links(page_html):
    """Return ``(href, title)`` pairs found inside the ``<th>`` cells.

    Each ``<th>`` cell is searched for an ``<em>...</em>`` element followed by
    a link; the href is returned exactly as it appears in the markup.
    """
    links = []
    for cell in HEADER_CELL_RE.findall(page_html):
        links.extend(THREAD_LINK_RE.findall(cell))
    return links


def resolve_link(href, base=forumurl):
    """Turn an href scraped from HTML into a URL that can be requested.

    An href copied out of markup still has its ampersands written as
    ``&amp;`` and may be relative to the page it came from.  Requesting it
    verbatim either fails (relative URL) or sends mangled query parameters,
    so the entity is decoded and the result is resolved against *base*.
    Absolute URLs pass through unchanged apart from the entity decoding.
    """
    return urljoin(base, href.replace('&amp;', '&'))


def visitpage(url):
    """Fetch *url*, print the page and its table contents, then wait for Enter.

    The request goes through ``urllib2.urlopen`` and therefore uses whatever
    opener has been installed globally.  The body is decoded as GBK.
    """
    with closing(urllib2.urlopen(url)) as response:
        nr = response.read().decode('gbk')
    print(nr)
    print(extract_tables(nr))
    read_line('press any key')


def main():
    """Log in, load the forum listing and visit every thread it links to."""
    # Cookie-aware opener, installed globally so visitpage() shares it.
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    # Python 3 requires the POST body as bytes; encoding is a no-op for the
    # ASCII form data on Python 2.
    url_data = urlencode(user_data).encode('ascii')
    req = urllib2.Request(url=loginurl, data=url_data)
    # Perform the login; the response body is not inspected.
    with closing(opener.open(req)):
        pass
    print('Login Done!')

    # Open the forum listing page.
    with closing(opener.open(forumurl)) as res:
        print('Prepare load content')
        cont = res.read().decode('gbk')
    print('Load content')

    # Find the thread links and dump each thread.
    for href, title in find_thread_links(cont):
        print(title)
        visitpage(resolve_link(href))


if __name__ == '__main__':
    main()


这里有个问题:我在爬取某个标题链接的内容时,论坛不给我显示内容。