python 模拟浏览器

来源:互联网 发布:淘宝客丢单如何找回 编辑:程序博客网 时间:2024/05/18 03:44

想用 Python 模拟浏览器访问 Web 来测试些东西,有哪几种方法呢?

一类:单纯的访问web,不解析其js,css等。

1. urllib2

# -*- coding: utf-8 -*-
# Fetch a page through an HTTP proxy with urllib2 (plain HTTP only: no JS/CSS).
try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 exposes the same ProxyHandler/build_opener/urlopen API here.
    import urllib.request as urllib2


def Furllib2(ip, port, url, timeout):
    """Fetch *url* through the HTTP proxy at ip:port and print the response.

    Args:
        ip: Proxy host, interpolated into ``http://<ip>:<port>``.
        port: Proxy port (string or int).
        url: Address to request.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        True when the request succeeded, 0 when any error occurred.
    """
    proxydict = {'http': "http://%s:%s" % (ip, port)}
    print(proxydict)
    proxy_handler = urllib2.ProxyHandler(proxydict)
    opener = urllib2.build_opener(proxy_handler)
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    # install_opener() is process-wide: later urlopen() calls use this proxy too.
    urllib2.install_opener(opener)
    try:
        response = urllib2.urlopen(url, timeout=timeout)
        try:
            print(response.geturl())
            print(response.getcode())
            print(response.info())
            print(response.read())
        finally:
            # Release the connection even if reading/printing fails.
            response.close()
        return True
    except Exception as exc:
        # Best-effort probe: report the failure instead of raising, but do not
        # swallow KeyboardInterrupt/SystemExit the way a bare ``except:`` would.
        print('some errors occored' + '-' * 50)
        print(repr(exc))
        return 0


def main():
    """Run one sample request through a hard-coded proxy and print the result."""
    proxyip = '14.18.16.69'
    proxyport = '80'
    url = 'http://www.cnblogs.com/'
    timeout = 4
    print(Furllib2(proxyip, proxyport, url, timeout))


if __name__ == "__main__":
    main()

 2. mechanize(与网站的自动化交互)

http://wwwsearch.sourceforge.net/mechanize/doc.html

def Fmechanize(url):
    """Open *url* with a cookie-aware mechanize opener and print what came back.

    The ``mechanize`` module is referenced but not imported here, so it must
    already be imported at module level.

    Args:
        url: Address to request with GET.

    Returns:
        True when the request succeeded, 0 when it raised.
    """
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    try:
        r = opener.open(url)  # GET
        # POST variant: r = opener.open("http://example.com/", data)
        print(r.geturl())
        print(r.info())
        return True
    except Exception as exc:
        # Report the cause rather than discarding it silently; a bare
        # ``except:`` would also have trapped KeyboardInterrupt/SystemExit.
        print('Fmechanize failed: %r' % (exc,))
        return 0

 二类:模拟浏览器,使用firefox等的浏览器引擎,支持js,css等。

1. selenium 的firefox或者chrome等驱动,但是由于要打开一个浏览器,所以会比较慢(浏览器驱动可以到selenium官网上下载,也可以到firefox插件处搜索)

def Fselenium_firefox(ip, port, url, timeout):
    """Load *url* in a real Firefox routed through an HTTP proxy; print cookies.

    Relies on ``webdriver`` (selenium), ``traceback`` and ``time`` being
    imported at module level.

    Args:
        ip: Proxy host.
        port: Proxy port; converted to ``int`` before being stored.
        url: Page to load.
        timeout: Passed to ``webdriver.Firefox`` as its ``timeout`` argument.

    Returns:
        1 when the page was loaded and its cookies were read, 0 on any error.
    """
    try:
        profile = webdriver.FirefoxProfile()
        profile.set_preference('network.proxy.type', 1)
        profile.set_preference('network.proxy.http', ip)
        # Callers may pass the port as a string (e.g. '80'); store it as an
        # integer so it is not written as a string preference.
        profile.set_preference('network.proxy.http_port', int(port))
        profile.update_preferences()
        driver = webdriver.Firefox(profile, timeout=timeout)
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it must
        # not be wrapped in print.
        traceback.print_exc()
        return 0

    try:
        driver.get(url)
        time.sleep(5)  # fixed pause after the page load before reading cookies
        cookies = driver.get_cookies()
        print(cookies)
        return 1
    except Exception:
        traceback.print_exc()
        return 0
    finally:
        # Always close the browser, on success and on failure alike.
        driver.quit()

 2. selenium :headless test使用selenium+ phantomjs驱动,无需打开浏览器,但是支持js的模拟浏览器动作,也就是说和你手工打开浏览器是没有区别的。

http://selenium.googlecode.com/git/docs/api/py/api.html

def Fselenium_phantomjs(ip, port, url, timeout):
    """Load *url* headlessly with PhantomJS through an HTTP proxy.

    Relies on ``webdriver`` (selenium), ``traceback`` and ``time`` being
    imported at module level.

    Args:
        ip: Proxy host.
        port: Proxy port.
        url: Page to load.
        timeout: Page-load timeout handed to ``set_page_load_timeout``.

    Returns:
        True when the driver's current URL could be read afterwards (even if
        the page load itself raised), 0 otherwise.
    """
    driver = None
    try:
        try:
            service_args = [
                '--proxy=%s:%s' % (ip, port),
                '--proxy-type=http',
            ]
            print(service_args)
            driver = webdriver.PhantomJS(service_args=service_args)
            # To specify the location of the phantomjs binary explicitly:
            # driver = webdriver.PhantomJS("/root/phantomjs-1.9.7-linux-x86_64/bin/phantomjs", service_args=service_args)
            driver.set_page_load_timeout(timeout)
            driver.get(url)
            time.sleep(4)  # fixed pause after the page load
        except Exception:
            traceback.print_exc()

        if driver is None:
            # The driver never started, so there is no URL to report.
            return 0

        try:
            print(driver.current_url)
            return True
        except Exception:
            traceback.print_exc()
            return 0
    finally:
        # Shut the PhantomJS process down on every path so it is not leaked.
        if driver is not None:
            driver.quit()

 3. qt,网上找来的代码

http://qt-project.org/wiki/PySide#PySide.QtWebKit.PySide.QtWebKit.QWebView.url

from PyQt4 import QtCore, QtGui, QtWebKit, QtNetwork


class cookieJar(QtNetwork.QNetworkCookieJar):
    """Cookie jar seeded from the value stored in the parent window's settings."""

    def __init__(self, cookiesKey, parent=None):
        super(cookieJar, self).__init__(parent)
        # ``parent`` is expected to expose a ``settings`` attribute.
        self.mainWindow = parent
        self.cookiesKey = cookiesKey

        stored = self.mainWindow.settings.value(self.cookiesKey)
        if stored:
            self.setAllCookies(QtNetwork.QNetworkCookie.parseCookies(stored))

    # Left disabled, as in the original snippet: would append every received
    # cookie to the stored settings value before delegating to the base class.
    # def setCookiesFromUrl(self, cookieList, url):
    #     cookiesValue = self.mainWindow.settings.value(self.cookiesKey)
    #     cookiesArray = cookiesValue if cookiesValue else QtCore.QByteArray()
    #     for cookie in cookieList:
    #         cookiesArray.append(cookie.toRawForm() + "\n")
    #     self.mainWindow.settings.setValue(self.cookiesKey, cookiesArray)
    #     return super(cookieJar, self).setCookiesFromUrl(cookieList, url)

    def deleteCookie(self, cookieList):
        # NOTE(review): ignores ``cookieList`` and passes an empty list as the
        # settings key; looks unfinished - confirm the intended behaviour.
        cookie = []
        self.mainWindow.settings.value(cookie)


class webView(QtWebKit.QWebView):
    """QWebView whose network access manager uses a ``cookieJar``."""

    def __init__(self, cookiesKey, url, parent=None):
        super(webView, self).__init__(parent)
        # ``url`` is accepted but not used here; the caller loads it afterwards.
        self.cookieJar = cookieJar(cookiesKey, parent)
        self.page().networkAccessManager().setCookieJar(self.cookieJar)


class myWindow(QtGui.QMainWindow):
    """Main window: a toolbar with a URL field and a tab widget of web views."""

    def __init__(self, parent=None):
        super(myWindow, self).__init__(parent)

        self.cookiesKey = "cookies"

        self.centralwidget = QtGui.QWidget(self)
        self.tabWidget = QtGui.QTabWidget(self.centralwidget)
        self.tabWidget.setTabsClosable(True)

        self.verticalLayout = QtGui.QVBoxLayout(self.centralwidget)
        self.verticalLayout.addWidget(self.tabWidget)

        self.actionTabAdd = QtGui.QAction(self)
        self.actionTabAdd.setText("Add Tab")
        self.actionTabAdd.triggered.connect(self.on_actionTabAdd_triggered)

        self.lineEdit = QtGui.QLineEdit(self)
        self.lineEdit.setText("http://www.example.com")

        self.toolBar = QtGui.QToolBar(self)
        self.toolBar.addAction(self.actionTabAdd)
        self.toolBar.addWidget(self.lineEdit)
        self.addToolBar(
            QtCore.Qt.ToolBarArea(QtCore.Qt.TopToolBarArea), self.toolBar
        )

        self.setCentralWidget(self.tabWidget)

        # Read by cookieJar through ``parent.settings`` when a tab is created.
        self.settings = QtCore.QSettings()

    @QtCore.pyqtSlot()
    def on_actionShowCookies_triggered(self):
        """Print the raw form of every cookie held by the current tab."""
        current_view = self.tabWidget.currentWidget()
        manager = current_view.page().networkAccessManager()
        for cookie in manager.cookieJar().allCookies():
            print(cookie.toRawForm())

    @QtCore.pyqtSlot()
    def on_actionTabAdd_triggered(self):
        """Open a new tab for the URL in the line edit, or a blank page."""
        url = self.lineEdit.text()
        self.addNewTab(url if url else 'about:blank')

    def addNewTab(self, url):
        """Create a web view for *url*, add it as a tab and select that tab."""
        tab_title = u"Tab {0}".format(str(self.tabWidget.count()))

        view = webView(self.cookiesKey, url, self)
        view.loadFinished.connect(self.on_tabWidget_loadFinished)
        view.load(QtCore.QUrl(url))

        tab_index = self.tabWidget.addTab(view, tab_title)
        self.tabWidget.setCurrentIndex(tab_index)

    @QtCore.pyqtSlot()
    def on_tabWidget_loadFinished(self):
        # Reads the stored cookie value; the result is not used afterwards.
        stored_cookies = self.settings.value(self.cookiesKey)


if __name__ == "__main__":
    import sys

    app = QtGui.QApplication(sys.argv)
    app.setApplicationName('myWindow')

    main = myWindow()
    main.resize(666, 333)
    main.show()

    sys.exit(app.exec_())

 

4. qt-headless

http://qt-project.org/wiki/PySide#PySide.QtWebKit.PySide.QtWebKit.QWebView.url

import sys

# Explicit imports instead of ``from ... import *``: only these three names
# are used below.
from PyQt4.QtCore import QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage


class Render(QWebPage):
    """Load a URL in a QWebPage without showing a window.

    The constructor runs the Qt event loop until ``loadFinished`` fires; the
    loaded main frame is then available as ``self.frame``.
    """

    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks here until _loadFinished() quits the application.
        self.app.exec_()

    def _loadFinished(self, result):
        # ``result`` (the load-finished flag) is not inspected.
        self.frame = self.mainFrame()
        self.app.quit()


url = 'http://webscraping.com'
r = Render(url)
html = r.frame.toHtml()
print(html)

 5. splinter :打开浏览器,模拟操作,python的

http://splinter.cobrateam.info/docs/tutorial.html

# Each statement can also be typed at the interactive prompt, one at a time.
from splinter import Browser

browser = Browser()
url = "http://www.cnblogs.com"
browser.visit(url)

 

 

 

具体用哪个要看你有什么具体的需求了

0 0
原创粉丝点击