用python3实现HDU爬虫（后续可能更新VJ）2016.11.4更新

来源：互联网发布：淘宝上僵尸手办来源编辑：程序博客网时间：2024/05/17 02:52
主要问题：
1.爬到的信息有限
2.getstatus已经完成了：之前出错的原因是获取页面的请求被放到了循环外面，页面没有被重复获取，现已修正（Orz）
3.欢迎大佬们测试留言
import re
import time
import urllib.request
import urllib.parse

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every request made by this script.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36')


class Acauto(object):
    """Automated submitter for the HDU online judge (acm.hdu.edu.cn).

    Typical use: ``login()`` once, then ``acrush()`` to walk a range of
    problem IDs, search Baidu for CSDN blog solutions and submit them.
    """

    # Verdict strings that mean the judge has not finished yet.
    PENDING_VERDICTS = ('Queuing', 'Compiling', 'Running')

    def __init__(self):
        super().__init__()
        # Set by login(); used to build the per-user status/solved URLs.
        self.username = None
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': USER_AGENT})

    def login(self, username, password):
        """Post the login form and remember *username* for later calls.

        Returns the ``requests`` response of the login request.
        """
        url = 'http://acm.hdu.edu.cn/userloginex.php?action=login'
        data = {
            'username': username,
            'userpass': password,
            'login': 'Sign In',
        }
        headers = {
            'host': 'acm.hdu.edu.cn',
            'origin': 'http://acm.hdu.edu.cn',
            'referer': 'http://acm.hdu.edu.cn/'
        }
        self.username = username
        return self.session.post(url, data=data, headers=headers)

    @staticmethod
    def _latest_verdict(soup, problem_id):
        """Return the verdict text of the first status row for *problem_id*.

        Returns ``None`` when the page lacks the expected table layout or
        no row matches.
        """
        if soup.table is None:
            return None
        tables = soup.table.find_all('table')
        if len(tables) < 2:
            return None
        for row in tables[-2].find_all('tr'):
            cells = row.find_all('td')
            # Rows with fewer than four cells cannot be submission rows.
            if len(cells) > 3 and cells[3].get_text(strip=True) == problem_id:
                return cells[2].get_text(strip=True)
        return None

    def getstatus(self, problemID, username=None, max_polls=120):
        """Poll the status page until the submission has a final verdict.

        The page is re-fetched once per second, at most *max_polls* times.

        Args:
            problemID: problem number whose first matching row is watched.
            username: account whose status page is read; defaults to the
                user passed to ``login()``.
            max_polls: upper bound on the number of page fetches.

        Returns:
            The final verdict string (also printed), or ``None`` if no
            final verdict showed up within *max_polls* fetches.

        Raises:
            ValueError: if no username is given and ``login()`` was not
                called.
        """
        username = username or self.username
        if not username:
            raise ValueError('getstatus() needs a username; call login() first')
        status_url = ('http://acm.hdu.edu.cn/status.php?user=%s'
                      % urllib.parse.quote(username))
        wanted = str(problemID)
        for _ in range(max_polls):
            time.sleep(1)
            # The page must be fetched inside the loop so the verdict refreshes.
            page = self.session.get(status_url)
            soup = BeautifulSoup(page.text, 'lxml')
            verdict = self._latest_verdict(soup, wanted)
            if verdict is not None and verdict not in self.PENDING_VERDICTS:
                print(verdict)
                return verdict
        print('no final verdict for problem', problemID)
        return None

    def submit(self, problemID, code, language=0):
        """Submit *code* for *problemID* and wait for its verdict.

        *language* is the numeric language code of the judge's submit form.
        Returns whatever ``getstatus()`` returns.
        """
        url = 'http://acm.hdu.edu.cn/submit.php?action=submit'
        data = {
            'check': '0',
            'problemid': str(problemID),
            'language': str(language),
            'usercode': code
        }
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded'
        }
        print('submitting problem: ', problemID)
        self.session.post(url, data=data, headers=headers)
        return self.getstatus(problemID)

    def getsolved(self, username):
        """Return the list of problem numbers (as strings) solved by *username*.

        Returns an empty list when the user page has no solved-problem
        paragraph.
        """
        url = ('http://acm.hdu.edu.cn/userstatus.php?user=%s'
               % urllib.parse.quote(username))
        r = self.session.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        # The solved problems sit in one <p align="left">, ';'-separated.
        result = soup.find('p', align='left')
        if result is None:
            return []
        solved = []
        for item in result.text.split(';'):
            match = re.search(r'\d{4}', item)    # four-digit problem number
            if match:
                solved.append(match.group(0))
        return solved

    def getbaidu(self, problemID):
        """Search Baidu for CSDN blog posts solving *problemID*.

        Returns a list of code snippets taken from blog pages whose title
        contains the problem number.
        """
        solutions = []
        solutionurls = []
        # Build the search URL from the problem number.
        url = r'http://www.baidu.com/s?wd=hdu%20' + str(problemID)
        baidusession = requests.Session()
        baidusession.headers.update({'User-Agent': USER_AGENT})
        r = baidusession.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        res = soup.find_all('a', attrs={'target': '_blank', 'class': 'c-showurl',
                                        'style': 'text-decoration:none;'})
        for item in res:
            href = item.get('href')
            if href and re.match(r'blog\.csdn\.net', item.text):
                solutionurls.append(href)
        for item in solutionurls:
            r = baidusession.get(item)
            soup = BeautifulSoup(r.text, 'html.parser')
            code = soup.find(attrs={'name': 'code', 'class': 'cpp'})
            if not code:
                continue
            # Only trust the post if its title mentions the problem number.
            # A page without the expected title element is skipped rather
            # than aborting the remaining candidates.
            title = soup.find('span', class_='link_title')
            if title is None or title.text is None:
                continue
            if str(problemID) not in title.text:
                continue
            solutions.append(code.text)
        print(problemID, 'solutions finded: ', len(solutions))
        return solutions

    @staticmethod
    def _guess_language(answer):
        """Return the submit-form language code for *answer*, or ``None``.

        Code including ``iostream`` or ``cstdio`` maps to 2, code including
        ``stdio.h`` maps to 0 (presumably C++ and C-style sources
        respectively; confirm against the judge's language list).
        """
        if 'iostream' in answer or 'cstdio' in answer:
            return 2
        if 'stdio.h' in answer:
            return 0
        return None

    def acrush(self, start=1000, end=5932, interval=10):
        """Try to solve every unsolved problem in ``range(start, end)``.

        For each problem not yet accepted for the logged-in user, candidate
        solutions from ``getbaidu()`` are submitted one by one, sleeping
        *interval* seconds after each submission, until the problem shows
        up as solved or the candidates run out.

        Raises:
            ValueError: if ``login()`` was not called first.
        """
        if not self.username:
            raise ValueError('acrush() needs a logged-in user; call login() first')
        for problemID in range(start, end):
            # Skip problems that are already accepted.
            if str(problemID) in self.getsolved(self.username):
                continue
            print(problemID, 'is not AC, start solving it...')
            for answer in self.getbaidu(problemID):
                # Stop as soon as an earlier candidate got accepted.
                if str(problemID) in self.getsolved(self.username):
                    break
                language = self._guess_language(answer)
                if language is None:
                    print('language=???')
                    continue
                print('language=', language)
                self.submit(problemID, answer, language=language)
                time.sleep(interval)


if __name__ == '__main__':
    c = Acauto()
    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to environment variables or a config file.
    user = 'zzuliauto2'
    password = '19951106'
    startID = 1010
    c.login(user, password)
    c.acrush(startID)
0 0