爬虫采集-基于webkit核心的客户端Ghost.py [爬虫实例]
来源:互联网 发布:米思米2015选型软件 编辑:程序博客网 时间:2024/06/04 23:33
1
2
3
4
# Minimal Ghost.py session: open a page, then check the HTTP status and content.
from ghost import Ghost

ghost = Ghost()
# open() returns a pair; the first element exposes http_status.
page, extra_resources = ghost.open("http://xiaorui.cc")
assert page.http_status == 200 and 'xiaorui' in ghost.content
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Build and install SIP 4.14.6 from source.
wget http://sourceforge.net/projects/pyqt/files/sip/sip-4.14.6/sip-4.14.6.tar.gz
tar zxvf sip-4.14.6.tar.gz
cd sip-4.14.6
python configure.py
make
sudo make install

# Build and install PyQt4 4.10.1 (Mac GPL source package).
wget http://sourceforge.net/projects/pyqt/files/PyQt4/PyQt-4.10.1/PyQt-mac-gpl-4.10.1.tar.gz
tar zxvf PyQt-mac-gpl-4.10.1.tar.gz
cd PyQt-mac-gpl-4.10.1
python configure.py
make
sudo make install

# Download the PySide 1.1.1 installer package and open it.
wget http://pyside.markus-ullmann.de/pyside-1.1.1-qt48-py27apple.pkg
open pyside-1.1.1-qt48-py27apple.pkg

# Install Flask from source.
git clone https://github.com/mitsuhiko/flask.git
cd flask
sudo python setup.py install

# Install Ghost.py from source.
# Cloned over https: GitHub no longer serves the unauthenticated git:// protocol.
git clone https://github.com/carrerasrodrigo/Ghost.py.git
cd Ghost.py
sudo python setup.py install
1
2
# Create the Ghost client instance used by the examples that follow.
from ghost import Ghost

ghost = Ghost()
1
# Open a web page; the call returns a (page, resources) pair.
page, resources = ghost.open('http://my.web.page')
1
2
# Evaluate a JavaScript expression in the page: read the 'value' attribute of #my-input.
result, resources = ghost.evaluate(
    "document.getElementById('my-input').getAttribute('value');"
)
1
2
# Click the element with id 'link' via JavaScript.
# NOTE(review): expect_loading=True presumably makes Ghost wait for the page
# load triggered by the click — confirm against the Ghost.py docs.
page, resources = ghost.evaluate(
    "document.getElementById('link').click();",
    expect_loading=True,
)
1
# Set the value of the field matched by the CSS selector.
result, resources = ghost.set_field_value("input[name=username]", "jeanphix")
1
2
3
4
# Fill several fields of the form matched by the selector in one call.
result, resources = ghost.fill(
    "form",
    {
        "username": "jeanphix",
        "password": "mypassword",
    },
)
1
# Fire the "submit" event on the form.
# NOTE(review): expect_loading=True presumably waits for the resulting page
# load — confirm against the Ghost.py docs.
page, resources = ghost.fire_on("form", "submit", expect_loading=True)
# Wait helpers listed by the article. In the OpenStreetMap example further down,
# wait_for_selector is invoked as a method of the ghost instance.
# NOTE(review): presumably all three are Ghost methods rather than free
# functions — confirm against the Ghost.py docs.
wait_for_page_loaded()
wait_for_selector(selector)
wait_for_text(text)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Example: exercising a Flask application through Ghost's GhostTestCase.
import unittest

from flask import Flask
from ghost import GhostTestCase

app = Flask(__name__)


@app.route('/')
def home():
    """Serve the plain-text body that MyTest.test_open_home checks for."""
    return 'hello world'


class MyTest(GhostTestCase):
    """Open the application's home page with Ghost and check its content."""

    # Port used to build the URL opened in test_open_home.
    port = 5000

    @classmethod
    def create_app(cls):
        """Return the Flask application under test."""
        return app

    def test_open_home(self):
        """The home page content must be exactly 'hello world'."""
        self.ghost.open("http://localhost:%s/" % self.port)
        self.assertEqual(self.ghost.content, 'hello world')


if __name__ == '__main__':
    unittest.main()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Opens the web page
ghost.open('http://www.openstreetmap.org/')
# Waits for form search field
ghost.wait_for_selector('input[name=query]')
# Fills the form
ghost.fill("#search_form", {'query': 'France'})
# Submits the form
ghost.fire_on("#search_form", "submit")
# Waits for results (an XHR has been called here)
ghost.wait_for_selector('#search_osm_nominatim .search_results_entry a')
# Clicks first result link
ghost.click('#search_osm_nominatim .search_results_entry:first-child a')
# Checks if map has moved to expected latitude
lat, resources = ghost.evaluate("map.center.lat")
assert float(lat.toString()) == 5860090.806537
1
2
In [10]: print page.headers
{u'BDQID': u'0xf594a31a03344b4f', u'Content-Encoding': u'gzip', u'Set-Cookie': u'BDSVRTM=381; path=/\nH_PS_PSSID=2976_2981_3091; path=/; domain=.baidu.com', u'BDUSERID': u'0', u'Server': u'BWS/1.0', u'Connection': u'Keep-Alive', u'Cache-Control': u'private', u'Date': u'Tue, 03 Sep 2013 09:53:56 GMT', u'Content-Type': u'text/html;charset=utf-8', u'BDPAGETYPE': u'3'}
0 0
- 爬虫采集-基于webkit核心的客户端Ghost.py [爬虫实例]
- 爬虫采集-基于webkit核心的客户端Ghost.py [爬虫实例]
- 基于WebKit的网络爬虫
- 基于webkit技术的爬虫
- 基于WebKit的网络爬虫
- SuperSpider 基于webkit的web2.0爬虫介绍
- ghost.py 使用实例
- 基于DevTools协议+Chromium headless的客户端爬虫框架
- 基于python的爬虫
- 一个简单的爬虫douban_list_spider.py
- 基于Python、PyQuery实现的一个网络爬虫实例
- PY爬虫Demo集合
- PY爬虫开发利器
- PY 爬虫 Urllib2
- 网络爬虫的采集,处理,存储
- Python爬虫采集CloudBlog网站的文章
- QueryList是一套简洁、优雅的PHP采集工具(爬虫),基于phpQuery。
- 一个信息采集器(小爬虫)的实例和优化
- Swift iOS实现把PCM语音转成MP3格式
- springMVC监听器
- HTML 命名规范
- mysql update
- Markdown 语法的简要规则
- 爬虫采集-基于webkit核心的客户端Ghost.py [爬虫实例]
- 自定义圆形的ProgressBar
- mysql查询优化技巧
- live555延时队列
- HTML加入CSS
- io 流(对象流 objectInputSteam ,objectOutStream)
- Android蓝牙开发
- ACM--DFS--poj 1562--Oil Deposits
- 共同学习Java源代码--数据结构--开篇语