Python爬取OJ提交过的代码

来源:互联网 发布:为什么淘宝评论删不掉 编辑:程序博客网 时间:2024/04/28 04:42

把之前在学校OJ提交过的正确代码保存到本地,做一下备份。主要用到了requests和BeautifulSoup(bs4),还是挺好玩的hhh。


import os
import re

import requests
from bs4 import BeautifulSoup


class Code(object):
    """Back up accepted submissions from the SDUT online judge to local files.

    The methods form a call chain started by ``login``:
    ``login`` -> ``post`` -> ``check_login`` -> ``info`` -> ``submissions``
    -> ``page`` -> ``find_code`` -> ``save``.  Every HTTP request goes through
    the module-level ``session`` so that the login cookies are reused.
    """

    def __init__(self):
        """Set up the URLs, request headers and account used by the crawler."""
        self.home_url = 'http://acm.sdut.edu.cn/onlinejudge2/index.php/Home'
        self.login_url = 'http://acm.sdut.edu.cn/onlinejudge2/index.php/Home/Login/login'
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}
        # NOTE(review): credentials are hard-coded; replace with your own account.
        self.user = 'tooog'
        self.passw = '123123'

    def login(self, url):
        """Post the login form and start the crawl.

        ``url`` is accepted for interface compatibility but is not used: the
        form is always posted to ``self.login_url``.
        """
        post_data = {'user_name': self.user, 'password': self.passw}
        self.post(self.login_url, post_data)

    def post(self, url, post_data):
        """POST ``post_data`` to ``url`` via the shared session, then verify the login."""
        session.post(url, post_data, headers=self.headers)
        self.check_login()

    def check_login(self):
        """Check the login state on the home page and continue to the profile page.

        Prints a success message and follows the first link inside the
        ``navbar-right`` list when a ``Logout`` text node is present;
        otherwise prints a failure message and exits the program.

        Raises:
            SystemExit: if the home page contains no ``Logout`` text.
        """
        html = self.session_get(self.home_url)
        soup = BeautifulSoup(html.text, 'html5lib')
        if soup.find(text='Logout') is None:
            print('登陆失败')
            # Same effect as exit(), without relying on the helper that the
            # ``site`` module injects into builtins.
            raise SystemExit
        print('登陆成功')
        profile_href = soup.find('ul', class_='navbar-right').find('a')['href']
        self.info('http://acm.sdut.edu.cn' + profile_href)

    def info(self, url):
        """Fetch the profile page at ``url`` and hand its Submissions link on.

        Raises:
            AttributeError: if the page contains no ``Submissions`` link.
        """
        info_html = self.session_get(url)
        sub_href = re.search(r'href="(.*?)">Submissions', info_html.text).group(1)
        sub_url = self.home_url + sub_href
        self.submissions(sub_url)

    def submissions(self, url):
        """Start paging through the user's submission list.

        The incoming ``url`` is ignored; the listing URL is rebuilt from
        ``self.user`` with ``result/1`` (presumably the "accepted" filter --
        confirm against the site) and page 1.
        """
        url = 'http://acm.sdut.edu.cn/onlinejudge2/index.php/Solution/status/username/' + self.user + '/result/1/p/1.html'
        self.page(url)

    def page(self, url):
        """Visit every page of the submission listing that starts at ``url``.

        The page count is read from the text of the last ``a.num`` link on
        the first page.  ``url`` is expected to end in ``<page>.html``.
        """
        sub_html = self.session_get(url)
        num_links = BeautifulSoup(sub_html.text, 'html5lib').find_all('a', class_='num')
        # Without any pagination link, treat the listing as a single page
        # instead of failing with IndexError on an empty result.
        page_count = int(num_links[-1].get_text()) if num_links else 1
        # Drop the trailing "<page>.html" segment whatever its digit count.
        base_url = url.rsplit('/', 1)[0] + '/'
        for pg in range(1, page_count + 1):
            self.find_code(base_url + str(pg) + '.html')

    def find_code(self, url):
        """Save the code behind every row of the listing page at ``url``.

        For each ``tr`` of the first ``tbody``, the text of the third cell is
        used as the file id and the link in the seventh cell as the code page.
        """
        all_code = self.session_get(url)
        rows = BeautifulSoup(all_code.text, 'html5lib').find('tbody').find_all('tr')
        for row in rows:
            cells = row.find_all('td')
            code_id = cells[2].text
            code_url = 'http://acm.sdut.edu.cn' + cells[6].a['href']
            self.save(code_url, code_id)

    def save(self, url, code_id):
        """Download the code page at ``url`` and write it to ``<code_id>.cpp``.

        The source text is taken from the ``pre`` element whose class is
        ``brush:cpp;``.  The file is written in the current directory.
        """
        code_html = self.session_get(url)
        code = BeautifulSoup(code_html.text, 'html5lib').find('pre', class_='brush:cpp;').get_text()
        # Context manager closes the file even if the write fails; explicit
        # UTF-8 keeps non-ASCII source text from failing on other locales.
        with open(code_id + '.cpp', 'w', encoding='utf-8') as f:
            f.write(code)

    def session_get(self, url):
        """GET ``url`` through the shared session and return the response."""
        return session.get(url, headers=self.headers)


# Shared session so cookies set by the login request are sent on later requests.
session = requests.Session()

if __name__ == '__main__':
    code = Code()
    code.login('http://acm.sdut.edu.cn/onlinejudge2/index.php/Home/Login/login')