selenium 和 casperjs 两种数据抓取方式(进来的朋友请留言,共同探讨)

来源:互联网 发布:加拿大炮王吴亦凡 知乎 编辑:程序博客网 时间:2024/04/30 16:00

今天分别用 selenium 和 casperjs 两种方式,对 https://class.coursera.org/nlp/lecture 网站上 ppt、pdf、srt、mp4 文件的下载地址进行数据抓取。

1、python+selenium

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print lecture titles and resource links from the Coursera NLP lecture page.

Python 2 script: the page is loaded with PhantomJS through selenium and the
resulting HTML is parsed with BeautifulSoup.
"""
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import sys

# Python 2 only idiom: switch the interpreter's default encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')


def catchDate(s):
    """Extract lecture data from the page HTML.

    Prints each section heading, the title of every ``li.unviewed`` item and
    the href of every link inside its ``course-lecture-item-resource`` div.

    :param s: HTML source of the lecture page.
    :return: list with one entry per successfully processed section that has
        at least one ``li.unviewed`` item; each entry is the list of those
        items.
    """
    soup = BeautifulSoup(s)
    sections = []

    for section in soup.findAll("ul", class_="course-item-list-section-list"):
        try:
            # The section heading is looked up in the element right before the list.
            print(section.previous_sibling.find('h3').get_text())
            lectures = section.findAll('li', class_="unviewed")
            for lecture in lectures:
                print('          ' + lecture.find('a').get_text())
                resources = lecture.find('div', class_="course-lecture-item-resource")
                for link in resources.findAll('a'):
                    print('      ' + link['href'])
        except Exception as e:
            # Best effort: a section with unexpected markup is skipped, but the
            # reason is reported instead of being silently discarded.
            sys.stderr.write('skipping section: %s\n' % e)
            continue
        # The original compared the list with "" (always true); only keep
        # sections that actually contain lectures.
        if lectures:
            sections.append(lectures)
    return sections


starttime = time.time()
# Raw string so the backslashes in the Windows path are never read as escapes.
driver = webdriver.PhantomJS(executable_path=r'C:\phantomjs-1.9.7-windows\phantomjs.exe')
try:
    driver.get("https://class.coursera.org/nlp/lecture")
    html = driver.page_source
    content = catchDate(html)
    endtime = time.time()
finally:
    # quit must be *called*; a bare `driver.quit` leaves PhantomJS running.
    driver.quit()
print(endtime - starttime)

2、casperjs

// CasperJS script: loads the Coursera NLP lecture page, collects section
// headings, lecture titles and resource links with jQuery inside the page,
// writes them to a text file and prints the elapsed time.
// NOTE: the companion setup uses PhantomJS 1.9.7, so this stays on ES5
// syntax (var / function expressions) on purpose.

var casper = require("casper").create({
    // Injected into the page so that jQuery is available inside evaluate().
    clientScripts: ["jquery-1.7.js"],
    stepTimeout: 120 * 1000,
    pageSettings: {
        loadImages: false
    },
    verbose: true,
    logLevel: "error"
});
var fs = require('fs');

var numberOfLinks = 0;
var filename = 'content.txt';
var fullContent = "";
var startTime = new Date();
var endTime;

/** Step: print the time at which the script started. */
function getStartTime() {
    this.echo(startTime);
}

/**
 * Step: build the report text inside the page context and store it in
 * fullContent. Only jQuery (not $) is used so the code does not depend on
 * which library owns the $ alias in the page.
 */
function getcontent() {
    fullContent = this.evaluate(function() {
        var content = "";
        jQuery('.course-item-list-section-list').each(function() {
            var section = jQuery(this);
            // Heading is read from the element preceding the list.
            content += section.prev().find("h3").text() + '\r\n';
            section.find("li").each(function() {
                var item = jQuery(this);
                content += item.find("a").first().text() + '\r';
                item.find("div a").each(function() {
                    content += jQuery(this).attr("href") + '\r';
                });
                content += '\r\n';
            });
            content += '\r\n\r\n';
        });
        return content;
    });
}

/** Step: write the collected text to `filename`, overwriting previous content. */
function writefile() {
    // Skip the write when evaluate() did not produce a string.
    if (typeof fullContent !== 'string') {
        this.echo('no content extracted, nothing written to ' + filename);
        return;
    }
    this.echo('writing to ' + filename);
    fs.write(filename, fullContent, 'w');
}

casper.start("https://class.coursera.org/nlp/lecture", function() {
    numberOfLinks = this.evaluate(function() {
        return __utils__.findAll('.course-item-list-section-list').length;
    });
    this.echo(numberOfLinks + " items found");
});

// Steps are registered in sequence here; this is the same execution order as
// the original chain where each step scheduled the next one via this.then().
casper.then(getStartTime);
casper.then(getcontent);
casper.then(writefile);
casper.then(function exitSystem() {
    endTime = new Date();
    // Elapsed time in milliseconds since the script started.
    this.echo(endTime - startTime);
    casper.exit();
});

casper.run();

因为不熟练,感觉写得不太好,求大神对方法进行指导!!!


参考:

https://gist.github.com/imjared/5201405

http://casperjs.readthedocs.org/en/latest/modules/casper.html#evaluate

http://blog.csdn.net/u012577500/article/details/18185399

http://stackoverflow.com/questions/14894311/casperjs-windows-installation-how-is-it-done-the-correct-way-please

http://blog.csdn.net/sagomilk/article/details/20800543

0 0