存档

来源:互联网 发布:java 加断点快捷键 编辑:程序博客网 时间:2024/05/06 08:37
# -*- coding: utf-8 -*-
"""Log in to the site at 210.42.121.241 and save the score-query page.

Typical flow (taken from the commented-out example in the original script)::

    mySpider = spider()
    im = mySpider.getCaptcha()      # e.g. im.show(), then read the code
    mySpider.studentID = '...'
    mySpider.password = '...'
    mySpider.captcha = '...'
    mySpider.loginMainPage()
    mySpider.getAndSaveScore()

The module runs on both Python 2 and Python 3.
"""
import datetime
import hashlib
import io
import sys
from contextlib import closing

try:  # Python 2 module names
    import cookielib
    import urllib2
    from urllib import urlencode
except ImportError:  # Python 3 equivalents, bound to the same local names
    import http.cookiejar as cookielib
    import urllib.request as urllib2
    from urllib.parse import urlencode

try:
    from PIL import Image
except ImportError:  # only spider.getCaptcha needs Pillow
    Image = None

try:
    from lxml import etree
except ImportError:  # only spider.getAndSaveScore needs lxml
    etree = None

if sys.version_info[0] == 2:
    # Kept from the original script: on Python 2 it switches the process-wide
    # default encoding to UTF-8.  Other code importing this module may rely
    # on that side effect, so it is preserved rather than removed.
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf8')

# The text (unicode) string type on the running interpreter.
_TEXT_TYPE = type(u'')


def setOpener():
    """Build a URL opener that stores cookies and sends a Firefox User-Agent.

    Returns:
        An opener created by ``build_opener`` with an ``HTTPCookieProcessor``
        bound to a fresh ``CookieJar``.
    """
    cookie = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    # Replace the header list instead of appending to it: the opener already
    # carries a default 'User-agent' entry, and because header names are
    # capitalized and only the first occurrence is applied, an appended
    # 'User-Agent' entry would never be sent.
    opener.addheaders = [
        ('User-Agent',
         'Mozilla/5.0 (Windows NT 5.1; rv:25.0) Gecko/20100101 Firefox/25.0'),
    ]
    return opener


def md5(str):
    """Return the hexadecimal MD5 digest of a string.

    Args:
        str: A byte string, or a text string (hashed as its UTF-8 encoding).
            The parameter name shadows the builtin but is kept so existing
            keyword callers continue to work.

    Returns:
        The 32-character hex digest, or ``''`` when the argument is neither
        a byte string nor a text string.
    """
    data = str
    if isinstance(data, _TEXT_TYPE):
        data = data.encode('utf-8')
    if not isinstance(data, bytes):
        return ''
    return hashlib.md5(data).hexdigest()


class spider:
    """Session holder for the captcha, login and score-query requests."""

    def __init__(self):
        # One opener is shared by every request so cookies are kept.
        self.opener = setOpener()
        self.imgUrl = 'http://210.42.121.241/servlet/GenImg'
        self.loginUrl = 'http://210.42.121.241/servlet/Login'
        self.queryScoreUrl = 'http://210.42.121.241/servlet/Svlt_QueryStuScore'
        self.studentID = ''
        self.password = ''
        self.captcha = ''  # captcha text, to be filled in by the caller
        self.mainPageContent = ''

    def getCaptcha(self):
        """Download the captcha image and return it as a PIL image.

        The caller is expected to display or save the image and store the
        recognised text in ``self.captcha`` before calling
        ``loginMainPage``.

        Raises:
            ImportError: If Pillow (PIL) is not installed.
        """
        if Image is None:
            raise ImportError('Pillow (PIL) is required to open the captcha image')
        with closing(self.opener.open(urllib2.Request(self.imgUrl))) as res:
            raw = res.read()
        # The image is opened from an in-memory buffer, not from disk.
        return Image.open(io.BytesIO(raw))

    def loginMainPage(self):
        """POST the login form and store the decoded response.

        Sends ``studentID``, the MD5 hex digest of ``password`` and
        ``captcha`` to ``loginUrl``; the response body is decoded as gb2312
        and stored in ``self.mainPageContent``.
        """
        pwdMD5 = md5(self.password)
        # urlencode output is pure ASCII; POST data must be bytes on Python 3.
        postdata = urlencode({
            'id': self.studentID,
            'pwd': pwdMD5,
            'xdvfb': self.captcha,
        }).encode('ascii')
        req = urllib2.Request(url=self.loginUrl, data=postdata)
        with closing(self.opener.open(req)) as response:
            self.mainPageContent = response.read().decode('gb2312')

    def getAndSaveScore(self):
        """Query the score page and write it to ``inputScore.html``.

        Returns:
            1 when the page was fetched and written, 0 when the stored main
            page is empty or contains no ``div#school`` ``onclick`` attribute.

        Raises:
            ImportError: If lxml is not installed and there is a page to parse.
        """
        if not self.mainPageContent:
            # Nothing to parse, e.g. loginMainPage was never called.
            print("Error:未能正确打开主页面")
            return 0
        if etree is None:
            raise ImportError('lxml is required to parse the main page')

        page = etree.HTML(self.mainPageContent)
        text = page.xpath('//div[@id="school"]/@onclick')
        try:
            # Characters 65..100 of the onclick attribute are sent as the
            # csrftoken parameter.  NOTE(review): the fixed offsets depend on
            # the page markup; an attribute shorter than 65 characters yields
            # an empty token here rather than an error - confirm against the
            # live page.
            token = text[0][65:101]
        except IndexError:
            print("Error:未能正确打开主页面")
            return 0

        GMT_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'
        GMT_time = datetime.datetime.utcnow().strftime(GMT_FORMAT)
        getParams = urlencode({
            'csrftoken': token,
            'learnType': '',
            'scoreFlag': '0',
            't': GMT_time,
            'term': '',
            'year': '0',
        })
        fullUrl = self.queryScoreUrl + '?' + getParams
        with closing(self.opener.open(urllib2.Request(fullUrl))) as response:
            # The page is decoded as gb2312, as in the original script.
            result = response.read().decode('gb2312')

        # Written as UTF-8 bytes: this is what the Python 2 original produced
        # implicitly through its UTF-8 default encoding.
        with open('inputScore.html', 'wb') as out:
            out.write(result.encode('utf-8'))
        return 1
0 0