Python 爬虫：爬取保研论坛

来源：互联网 | 发布：淘宝一键开店 | 编辑：程序博客网 | 时间：2024/05/24 05:01

# EE forum crawler
# by JerryFang
# 2013.11.13
#
# Logs in to the forum, loads one board listing, then fetches every thread
# linked from that listing and prints the <table> contents of each thread.
import re
from contextlib import closing

try:  # Python 2, which this script was originally written for
    import cookielib
    import urllib2
    from urllib import urlencode
    from urlparse import urljoin

    read_line = raw_input
except ImportError:  # Python 3: the same APIs under their new module names
    import http.cookiejar as cookielib
    import urllib.request as urllib2
    from urllib.parse import urlencode, urljoin

    read_line = input

LOGIN_URL = ('http://www.eeban.com/member.php?mod=logging&action=login'
             '&loginsubmit=yes&infloat=yes&lssubmit=yes&inajax=1')
FORUM_URL = 'http://www.eeban.com/forum.php?mod=forumdisplay&fid=137'
PAGE_ENCODING = 'gbk'

# Form fields copied from the site's login POST request.
# NOTE(review): the credentials are hard-coded here; the password value looks
# like a 32-character hex digest rather than plain text -- confirm, and move
# both values out of the source before sharing this script.
LOGIN_FORM = {
    'username': 'nku-ada',
    'password': '1986a7da84fc50b4c0140b1124b968d5',
    'quickforward': 'yes',
    'handlekey': 'ls',
}

# Compiled once at import time instead of on every visitpage() call.
TABLE_RE = re.compile(r'<table.*?>(.*?)</table>', re.DOTALL)
CELL_RE = re.compile(r'<th.*?>(.+?)</th>', re.DOTALL)
LINK_RE = re.compile(r'<em>.*?</em>.*?<a href="(.*?)".*?>(.*?)</a>', re.DOTALL)


def absolute_url(href, base=FORUM_URL):
    """Turn an href taken from page markup into a URL that can be requested.

    An href copied out of HTML source may be relative to the page it came
    from and may still have "&" written as "&amp;".  Requesting it unchanged
    either fails outright (no scheme) or sends mangled query parameters, so
    the server does not return the expected thread.

    Only "&amp;" is decoded: a full HTML unescape could corrupt an already
    clean URL whose parameter names merely start like an entity name.
    URLs that are already absolute and clean are returned unchanged.
    """
    return urljoin(base, href.replace('&amp;', '&'))


def extract_tables(page):
    """Return the inner markup of every <table> element in *page*."""
    return TABLE_RE.findall(page)


def find_threads(page):
    """Return (href, title) pairs for the links found in *page*.

    Only links inside <th> cells that follow an <em>...</em> element are
    returned.  The href is exactly as written in the markup; pass it through
    absolute_url() (visitpage() does so) before requesting it.
    """
    threads = []
    for cell in CELL_RE.findall(page):
        threads.extend(LINK_RE.findall(cell))
    return threads


def visitpage(url):
    """Fetch one thread page, print it and its tables, then wait for Enter.

    *url* may be absolute or an href straight from the listing markup.  The
    request goes through urlopen(), so it uses whichever opener has been
    installed (main() installs one that carries the login cookies).
    """
    # closing() releases the connection on both Python 2 and Python 3.
    with closing(urllib2.urlopen(absolute_url(url))) as response:
        nr = response.read().decode(PAGE_ENCODING)
    print(nr)
    print(extract_tables(nr))
    read_line('press any key')


def main():
    """Log in, load the board listing and visit every thread linked on it."""
    # Cookie-aware opener, installed globally so visitpage() shares the session.
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    # Login action.  The POST body must be bytes on Python 3; the encode is a
    # no-op for the ASCII form data on Python 2.
    request = urllib2.Request(
        url=LOGIN_URL,
        data=urlencode(LOGIN_FORM).encode('ascii'),
    )
    opener.open(request).close()
    # NOTE(review): the login response is not inspected, so this message is
    # printed whether or not the site accepted the credentials.
    print('Login Done!')

    # Open the board listing.
    with closing(opener.open(FORUM_URL)) as response:
        print('Prepare load content')
        listing = response.read().decode(PAGE_ENCODING)
    print('Load content')

    # Find the thread links and visit each one.
    for href, title in find_threads(listing):
        print(title)
        visitpage(href)


if __name__ == '__main__':
    main()

这里有个问题：我在爬取某个标题链接的内容时，论坛不给我显示内容。