Web_parser
来源:互联网 发布:线性变换旋转矩阵 编辑:程序博客网 时间:2024/06/03 19:21
# -*- coding: utf-8 -*-
import sys
import os
import math
import threading
import urllib
import urllib2
import re
import threading
import socket
import codecs
import time
from multiprocessing import Process, Lock, Queue, Manager
from multiprocessing.managers import BaseManager
# Python 2 only: reloading sys makes setdefaultencoding() callable again so
# the process-wide default string encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
# URL prefixes passed to craw_page() by fill_data(); the URL-encoded
# "kw=<query>" pair is appended directly to them. Both are empty here and
# must be filled in before the script can fetch anything.
base_template = ""
new_template = ""
# Matches one table row of the form
# "<td>N</td><td>TERM</td><td>POS_..." and captures TERM.
# Compiled once at import time instead of on every call.
_SEGMENT_PATTERN = re.compile(r"<td>\d+</td><td>(.+?)</td><td>POS_")


def extract_segment(data):
    """Return the terms found in *data*, joined with ``|``.

    Args:
        data: Page HTML as a string.

    Returns:
        The captured terms in document order separated by ``"|"``, or an
        empty string when nothing matches.
    """
    return "|".join(_SEGMENT_PATTERN.findall(data))
class QueryFeature(object):
    """A query together with the segmentations obtained for it.

    Attributes are plain instance attributes so the object can be pickled
    and sent through a multiprocessing queue.
    """

    def __init__(self, query):
        """Store *query* and start with both segmentations empty."""
        self.query = query
        # Segmentation extracted from the page fetched via new_template.
        self.new_segment = ""
        # Segmentation extracted from the page fetched via base_template.
        self.base_segment = ""

    def __repr__(self):
        return "QueryFeature(query=%r, base_segment=%r, new_segment=%r)" % (
            self.query, self.base_segment, self.new_segment)
# Global variables
# Lock held by fill_data() while it puts a result on the output queue.
g_lock = Lock()
# QueryFeature objects to process; filled by prepare_tasks().
g_total_task = []
#ok
def prepare_tasks(fname):
    """Load one query per line from *fname* into ``g_total_task``.

    Each line is stripped of surrounding whitespace; blank lines are
    skipped. Every remaining line becomes a ``QueryFeature`` appended to the
    module-level task list.

    Args:
        fname: Path of the text file holding the queries.
    """
    # The context manager guarantees the file handle is closed, which the
    # bare open() in a for-loop did not.
    with open(fname) as task_file:
        for line in task_file:
            line = line.strip()
            if line:
                g_total_task.append(QueryFeature(line))
def pipeline():
    """Script entry point: load the queries, then fetch and compare them.

    The query file path is taken from the first command-line argument.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        sys.stderr.write("usage: %s QUERY_FILE\n" % sys.argv[0])
        sys.exit(1)
    prepare_tasks(sys.argv[1])
    multi_get()
def craw_page(url, query, max_attempts=10, timeout=100):
    """Fetch ``url + urlencode({"kw": query})`` and return the page body.

    The request is retried up to *max_attempts* times. An attempt succeeds
    only if the returned body contains the marker text ``"Raw query"``.

    Args:
        url: URL prefix; the encoded ``kw=<query>`` pair is appended to it.
        query: Query text; it is UTF-8 encoded before being URL-encoded.
        max_attempts: Maximum number of requests to issue.
        timeout: Per-request timeout in seconds passed to ``urlopen``.

    Returns:
        ``(True, page_html)`` on success, ``(False, None)`` when every
        attempt failed.
    """
    encoded_query = urllib.urlencode({"kw": query.encode("UTF-8")})
    for _ in range(max_attempts):
        try:
            # Open a fresh connection on every attempt. Reusing the first
            # response object meant later attempts read an exhausted stream
            # and could never succeed.
            response = urllib2.urlopen(url + encoded_query, timeout=timeout)
            try:
                page_html = response.read()
            finally:
                response.close()
        except Exception:
            # Best-effort fetch: any network/HTTP failure just triggers the
            # next attempt. KeyboardInterrupt/SystemExit still propagate.
            continue
        if "Raw query" in page_html:
            return (True, page_html)
    return (False, None)
#ok
def fill_data(total_task, begin, end, out):
    """Worker body: process ``total_task[begin:end]`` and queue the results.

    For each task the query is fetched once with ``base_template`` and once
    with ``new_template``. If either fetch fails the task is skipped.
    Otherwise both segmentations are stored on the task object, which is
    then put on *out*.

    Args:
        total_task: Sequence of ``QueryFeature`` objects.
        begin: Index of the first task to process (inclusive).
        end: Index one past the last task to process (exclusive).
        out: Queue-like object with a ``put`` method receiving the results.
    """
    for qf in total_task[begin:end]:
        base_ok, base_html = craw_page(base_template, qf.query)
        if not base_ok:
            continue
        new_ok, new_html = craw_page(new_template, qf.query)
        if not new_ok:
            continue
        qf.base_segment = extract_segment(base_html)
        qf.new_segment = extract_segment(new_html)
        with g_lock:
            out.put(qf)
#ok
def multi_get():
    """Process ``g_total_task`` in parallel and print differing results.

    The task list is split into contiguous index ranges, one per worker
    process running ``fill_data``. After all workers have finished, every
    result whose base and new segmentations differ is printed as
    ``query<TAB>base_segment<TAB>new_segment``.
    """
    worker_count = 30
    load = len(g_total_task)
    # divmod keeps the arithmetic integral regardless of division semantics.
    quota, remain = divmod(load, worker_count)

    manager = Manager()
    # The parent process creates the queue and passes it to each child.
    out = manager.Queue()

    workers = []
    begin = 0
    for i in range(worker_count):
        # The first `remain` workers take one extra task, so the remainder
        # is spread evenly instead of being piled onto the last worker.
        end = begin + quota + (1 if i < remain else 0)
        if end == begin:
            # Fewer tasks than workers: nothing left to hand out.
            break
        worker = Process(target=fill_data,
                         args=(g_total_task, begin, end, out))
        worker.daemon = True
        worker.start()
        workers.append(worker)
        begin = end

    for worker in workers:
        worker.join()

    # All producers have exited, so empty() is a reliable end condition.
    while not out.empty():
        qf = out.get()
        if qf.base_segment != qf.new_segment:
            print("%s\t%s\t%s" % (qf.query, qf.base_segment, qf.new_segment))
# Run only when executed as a script: multiprocessing may import this module
# in child processes, and an unguarded call would start the pipeline again.
if __name__ == "__main__":
    pipeline()
import sys
import os
import math
import threading
import urllib
import urllib2
import re
import threading
import socket
import codecs
import time
from multiprocessing import Process, Lock, Queue, Manager
from multiprocessing.managers import BaseManager
# Python 2 only: reloading sys makes setdefaultencoding() callable again so
# the process-wide default string encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
# URL prefixes passed to craw_page() by fill_data(); the URL-encoded
# "kw=<query>" pair is appended directly to them. Both are empty here and
# must be filled in before the script can fetch anything.
base_template = ""
new_template = ""
# Matches one table row of the form
# "<td>N</td><td>TERM</td><td>POS_..." and captures TERM.
# Compiled once at import time instead of on every call.
_SEGMENT_PATTERN = re.compile(r"<td>\d+</td><td>(.+?)</td><td>POS_")


def extract_segment(data):
    """Return the terms found in *data*, joined with ``|``.

    Args:
        data: Page HTML as a string.

    Returns:
        The captured terms in document order separated by ``"|"``, or an
        empty string when nothing matches.
    """
    return "|".join(_SEGMENT_PATTERN.findall(data))
class QueryFeature(object):
    """A query together with the segmentations obtained for it.

    Attributes are plain instance attributes so the object can be pickled
    and sent through a multiprocessing queue.
    """

    def __init__(self, query):
        """Store *query* and start with both segmentations empty."""
        self.query = query
        # Segmentation extracted from the page fetched via new_template.
        self.new_segment = ""
        # Segmentation extracted from the page fetched via base_template.
        self.base_segment = ""

    def __repr__(self):
        return "QueryFeature(query=%r, base_segment=%r, new_segment=%r)" % (
            self.query, self.base_segment, self.new_segment)
# Global variables
# Lock held by fill_data() while it puts a result on the output queue.
g_lock = Lock()
# QueryFeature objects to process; filled by prepare_tasks().
g_total_task = []
#ok
def prepare_tasks(fname):
    """Load one query per line from *fname* into ``g_total_task``.

    Each line is stripped of surrounding whitespace; blank lines are
    skipped. Every remaining line becomes a ``QueryFeature`` appended to the
    module-level task list.

    Args:
        fname: Path of the text file holding the queries.
    """
    # The context manager guarantees the file handle is closed, which the
    # bare open() in a for-loop did not.
    with open(fname) as task_file:
        for line in task_file:
            line = line.strip()
            if line:
                g_total_task.append(QueryFeature(line))
def pipeline():
    """Script entry point: load the queries, then fetch and compare them.

    The query file path is taken from the first command-line argument.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        sys.stderr.write("usage: %s QUERY_FILE\n" % sys.argv[0])
        sys.exit(1)
    prepare_tasks(sys.argv[1])
    multi_get()
def craw_page(url, query, max_attempts=10, timeout=100):
    """Fetch ``url + urlencode({"kw": query})`` and return the page body.

    The request is retried up to *max_attempts* times. An attempt succeeds
    only if the returned body contains the marker text ``"Raw query"``.

    Args:
        url: URL prefix; the encoded ``kw=<query>`` pair is appended to it.
        query: Query text; it is UTF-8 encoded before being URL-encoded.
        max_attempts: Maximum number of requests to issue.
        timeout: Per-request timeout in seconds passed to ``urlopen``.

    Returns:
        ``(True, page_html)`` on success, ``(False, None)`` when every
        attempt failed.
    """
    encoded_query = urllib.urlencode({"kw": query.encode("UTF-8")})
    for _ in range(max_attempts):
        try:
            # Open a fresh connection on every attempt. Reusing the first
            # response object meant later attempts read an exhausted stream
            # and could never succeed.
            response = urllib2.urlopen(url + encoded_query, timeout=timeout)
            try:
                page_html = response.read()
            finally:
                response.close()
        except Exception:
            # Best-effort fetch: any network/HTTP failure just triggers the
            # next attempt. KeyboardInterrupt/SystemExit still propagate.
            continue
        if "Raw query" in page_html:
            return (True, page_html)
    return (False, None)
#ok
def fill_data(total_task, begin, end, out):
    """Worker body: process ``total_task[begin:end]`` and queue the results.

    For each task the query is fetched once with ``base_template`` and once
    with ``new_template``. If either fetch fails the task is skipped.
    Otherwise both segmentations are stored on the task object, which is
    then put on *out*.

    Args:
        total_task: Sequence of ``QueryFeature`` objects.
        begin: Index of the first task to process (inclusive).
        end: Index one past the last task to process (exclusive).
        out: Queue-like object with a ``put`` method receiving the results.
    """
    for qf in total_task[begin:end]:
        base_ok, base_html = craw_page(base_template, qf.query)
        if not base_ok:
            continue
        new_ok, new_html = craw_page(new_template, qf.query)
        if not new_ok:
            continue
        qf.base_segment = extract_segment(base_html)
        qf.new_segment = extract_segment(new_html)
        with g_lock:
            out.put(qf)
#ok
def multi_get():
    """Process ``g_total_task`` in parallel and print differing results.

    The task list is split into contiguous index ranges, one per worker
    process running ``fill_data``. After all workers have finished, every
    result whose base and new segmentations differ is printed as
    ``query<TAB>base_segment<TAB>new_segment``.
    """
    worker_count = 30
    load = len(g_total_task)
    # divmod keeps the arithmetic integral regardless of division semantics.
    quota, remain = divmod(load, worker_count)

    manager = Manager()
    # The parent process creates the queue and passes it to each child.
    out = manager.Queue()

    workers = []
    begin = 0
    for i in range(worker_count):
        # The first `remain` workers take one extra task, so the remainder
        # is spread evenly instead of being piled onto the last worker.
        end = begin + quota + (1 if i < remain else 0)
        if end == begin:
            # Fewer tasks than workers: nothing left to hand out.
            break
        worker = Process(target=fill_data,
                         args=(g_total_task, begin, end, out))
        worker.daemon = True
        worker.start()
        workers.append(worker)
        begin = end

    for worker in workers:
        worker.join()

    # All producers have exited, so empty() is a reliable end condition.
    while not out.empty():
        qf = out.get()
        if qf.base_segment != qf.new_segment:
            print("%s\t%s\t%s" % (qf.query, qf.base_segment, qf.new_segment))
# Run only when executed as a script: multiprocessing may import this module
# in child processes, and an unguarded call would start the pipeline again.
if __name__ == "__main__":
    pipeline()
阅读全文
0 0
- Web_parser
- SQL INSERT INTO
- 深入理解DOM事件类型系列第八篇——变动事件
- 调试rviz,并解决问题“For frame [laser]: Fixed Frame [map] does not exist”
- Masonry使用[草稿]
- 超华丽的HTML5 Canvas文字动画特效
- Web_parser
- 网站的浏览器缓存设置
- HBase的RowKey设计原则
- java性能监控工具MoSKito学习--一步一步开始工作1,2
- 入门springboot报错org.apache.coyote.http11.AbstractHttp11Protocol.setCompressableMimeTypes
- Dijkstra算法1.0
- Annotation processors must be explicitly declared now
- bzoj2716 [Violet 3]天使玩偶(CDQ分治)
- Redis实现分布式session功能的共享