模拟登录,抓取会员课程

来源:互联网 发布:js arguments用法 编辑:程序博客网 时间:2024/04/30 04:27

基于 Python,使用 Selenium 驱动 PhantomJS,使用会员账号和密码模拟登录某课程网站,根据课程类别检索课程,找到课程文档后,使用 PhantomJS 进行截图或者保存为 PDF。


# Simulated login + member-course scraper.
#
# Drives a headless PhantomJS browser through Selenium to log in to a course
# site, walks the course listing for each category ("tag") typed by the user,
# and saves every course document both as a PNG screenshot and as a PDF.
#
# NOTE: webdriver.PhantomJS and find_element_by_name only exist in older
# Selenium releases; this script targets those releases.
import os
import time

import requests
import selenium
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

BASE_URL = 'https://www.targetnet.com'
# 'member' lists paid member courses; switch to 'free' to list free courses.
FEE_TYPE = 'member'
COURSE_LIST_URL = (BASE_URL + '/courses/?category=all&course_type=all'
                   '&fee={fee}&tag={tag}&page={page}')


def _parse_cookie_header(raw_cookie):
    """Turn a raw ``name=value; name2=value2`` cookie string into a dict.

    Whitespace around each piece is stripped so cookie names do not keep a
    leading space, and pieces without an ``=`` (for example the unedited
    placeholder text) are ignored instead of raising ``ValueError``.
    """
    parsed = {}
    for piece in raw_cookie.split(';'):
        name, separator, value = piece.strip().partition('=')
        if separator and name:
            parsed[name] = value
    return parsed


def _first_or_none(page, xpath):
    """Return the first result of ``page.xpath(xpath)``, or None when empty."""
    matches = page.xpath(xpath)
    return matches[0] if matches else None


def download(driver, target_path):
    """Render the page currently loaded in ``driver`` to a PDF file.

    Uses PhantomJS's script-execution endpoint (technique taken from
    Stack Overflow): first sets an A4 portrait paper size, then calls
    ``this.render`` with ``target_path``.
    """
    def execute(script, args):
        driver.execute('executePhantomScript', {'script': script, 'args': args})

    # Register the PhantomJS-specific command with Selenium's executor so
    # driver.execute() knows which HTTP route to call.
    driver.command_executor._commands['executePhantomScript'] = (
        'POST', '/session/$sessionId/phantom/execute')
    execute('this.paperSize={format:"A4",orientation:"portrait"};', [])
    execute('''this.render("{}")'''.format(target_path), [])


def parse_class_page(tag, course_num):
    """Save every document of one course as PNG and PDF.

    ``tag`` is the course category (also the top-level output directory) and
    ``course_num`` is the course's site-relative link as found on the listing
    page. Courses whose category label is "训练营" are skipped, as are pages
    whose heading cannot be located.
    """
    class_url = BASE_URL + course_num
    class_page = etree.HTML(s.get(class_url).text)

    course_cat = _first_or_none(
        class_page,
        '/html/body/div[3]/div/div[1]/div[2]/div[1]/h4/span[2]/text()')
    if course_cat == "训练营":
        print('this is 训练营,需要专门购买')
        return
    course_name = _first_or_none(
        class_page,
        '/html/body/div[3]/div/div[1]/div[2]/div[1]/h4/span[1]/text()')
    if course_cat is None or course_name is None:
        # Skip just this course instead of letting an IndexError abort the
        # whole category.
        print('unexpected page layout, skipping {}'.format(class_url))
        return

    print('the course dealing with:{}'.format(course_name))
    path = './{}/{}'.format(tag, 'free_' + course_name)
    os.makedirs(path, exist_ok=True)

    try:
        document_num = class_page.xpath(
            '//*[@id="reports"]/span[@class="lab-id"]/@data-lab-id')
        length = len(document_num)
        print('there are {} document in this course'.format(length))
        for count, lab_id in enumerate(document_num, start=1):
            driver.get(class_url + '/labs/' + lab_id + '/document')
            time.sleep(3)  # give the page time to finish rendering
            # Image version.
            driver.save_screenshot('{}/{}of{}.png'.format(path, count, length))
            # PDF version.
            download(driver, '{}/{}of{}.pdf'.format(path, count, length))
            print('{} page of this class is saved'.format(count))
            time.sleep(1)
    except Exception as exc:
        # Best effort: a failure here stops this course only, but is reported
        # rather than silently swallowed.
        print('failed while saving documents of {}: {}'.format(class_url, exc))


def class_per_page(tag, page):
    """Process every course linked from listing page ``page`` of ``tag``."""
    url_this_page = COURSE_LIST_URL.format(fee=FEE_TYPE, tag=tag, page=page)
    this_page = etree.HTML(s.get(url_this_page).text)
    class_link = this_page.xpath(
        '/html/body/div[3]/div/div/div[2]/div[3]/div/a/@href')
    print('dealing with page {} of {}:'.format(page, tag))
    for link in class_link:
        parse_class_page(tag, link)
        time.sleep(1)


def all_class(tag):
    """Walk every listing page of category ``tag`` and download its courses."""
    tag_url = COURSE_LIST_URL.format(fee=FEE_TYPE, tag=tag, page=1)
    print('page url of {} is {}'.format(tag, tag_url))
    tag_page = etree.HTML(s.get(tag_url).text)
    try:
        # The page count is read from the third-from-last pagination link.
        page_num = int(tag_page.xpath(
            '//html/body/div[3]/div/div/div[2]/nav/ul/li/a/text()')[-3])
    except (IndexError, ValueError):
        # No usable pagination bar: treat the category as a single page.
        page_num = 1
    print('there are {} pages in {}'.format(page_num, tag))
    for page in range(1, page_num + 1):
        class_per_page(tag, str(page))


# Drive PhantomJS (a headless browser) through Selenium.
driver = webdriver.PhantomJS()
try:
    driver.get('http://www._thetargetnet_.com/login')
    account = driver.find_element_by_name('login')
    account.send_keys('your_account')
    password = driver.find_element_by_name('password')
    password.send_keys('your_password')
    password.send_keys(Keys.RETURN)

    # The cookie string has to be copied by hand after logging in.
    raw_cookie = 'copy the cookies here after you login in'
    cookies = _parse_cookie_header(raw_cookie)
    if not cookies:
        print('warning: no cookies parsed from raw_cookie; '
              'requests will be sent without a login session')
    # A requests session only accepts a cookiejar, not a plain dict.
    cookies_new = requests.utils.cookiejar_from_dict(
        cookies, cookiejar=None, overwrite=True)
    # One session so that all requests share the same cookies.
    s = requests.Session()
    s.cookies = cookies_new

    tag_list = []
    while True:
        tag = input('需要下载的课程类别(输入‘q’表示结束):')
        if tag == 'q':
            break
        tag_list.append(tag)

    for tag in tag_list:
        if not os.path.exists(tag):
            os.makedirs(tag)
            print('directory created')
        try:
            print('now downloading{}'.format(tag))
            all_class(tag)
        except Exception as exc:
            # Keep going with the remaining categories, but say what failed.
            print('failed to download {}: {}'.format(tag, exc))
finally:
    # Always shut down the PhantomJS process, even after an error or Ctrl-C.
    driver.quit()


阅读全文
0 0
原创粉丝点击