XPath爬取百度搜索结果

来源:互联网 发布:售电软件报价 编辑:程序博客网 时间:2024/05/23 02:20

webpy + nginx框架

主要使用 lxml 提供的 XPath 接口来解析 HTML 格式的数据

以及各种编码问题(抓取到的字节串需先按 UTF-8 解码后再解析,提取出的文本在输出前再编码为 UTF-8)

部分XPath实现:

def parse_baidu(self, body):
    """Extract search results from a Baidu mobile result page.

    Each result is a ``div`` whose ``class`` is exactly ``result c-result``
    or ``result c-result c-clk-recommend``.  Its ``data-log`` attribute
    holds a JSON-like payload from which the rank (``order``) and target
    URL (``mu``) are read; title and description come from the element's
    text content.

    Args:
        body: Raw page content as UTF-8 encoded bytes.

    Returns:
        A list of dicts with the keys ``order``, ``link``, ``title`` and
        ``desc`` (title/desc are UTF-8 encoded, or ``"Unknown"`` when the
        corresponding node is absent).  Results without a usable
        ``data-log`` payload are skipped.  ``None`` is returned when the
        page as a whole cannot be decoded or parsed.
    """
    print("parse_baidu ===>>")
    elements = []
    try:
        # Decode only; do NOT lower-case the document.  Lower-casing would
        # corrupt case-sensitive URLs in ``data-log`` as well as the
        # title/description text.
        page = etree.HTML(body.decode('utf-8'))
        tags = page.xpath(
            u"//div[@class='result c-result'] | "
            u"//div[@class='result c-result c-clk-recommend']")
        for tag in tags:
            # get link
            # ``.get`` instead of indexing: a result without ``data-log``
            # must not abort parsing of the remaining results.
            data_log_attrib = tag.attrib.get('data-log')
            if not data_log_attrib:
                continue
            try:
                # The payload uses single quotes; swap them so that the
                # JSON parser accepts it.
                data_log = json.loads(data_log_attrib.replace('\'', '"'))
                if not data_log:
                    continue
                node = {
                    'order': data_log['order'],
                    'link': data_log['mu'],
                }
            except (ValueError, KeyError, TypeError) as e:
                # Malformed or incomplete payload: skip this result only.
                print("parse_baidu skipped result: {}".format(e))
                continue

            # get title
            title_tags = tag.xpath(u"./div/a/h3")
            if title_tags:
                node['title'] = title_tags[0].xpath("string(.)").encode('utf-8')
            else:
                node['title'] = "Unknown"

            # get description
            desc_tags = tag.xpath(u"./div/div")
            if desc_tags:
                node['desc'] = desc_tags[0].xpath("string(.)").encode("utf-8")
            else:
                node['desc'] = "Unknown"

            elements.append(node)
        print("parse_baidu <<===")
        return elements
    except Exception as e:
        # Boundary handler: callers expect None rather than an exception.
        print("parse_baidu failed {}".format(e))
    print("parse_baidu <<=== end")
    return None
def query_baidu(self, keyword):
    """Query Baidu's mobile search for *keyword* and parse the results.

    Args:
        keyword: Search term as a text string; it is UTF-8 encoded and
            URL-quoted before being placed in the query string.

    Returns:
        Whatever ``self.parse_baidu`` returns for the downloaded page, or
        ``None`` when the request fails.
    """
    print("query_baidu ===>>")
    try:
        _keyword = urllib.quote(keyword.encode('utf-8'))
        # NOTE(review): every parameter after ``word`` is a fixed literal
        # (including ``oq``, which is unrelated to ``keyword``); they look
        # like values captured from a browser session -- confirm whether
        # they are still required.
        url_query = (
            "https://m.baidu.com/from=1014284b/s?word=" + _keyword +
            "&sa=tb&ts=6902153&t_kt=0&ie=utf-8"
            "&rsv_t=9e926S4zLuzG32Q2kkM5Tu%252Bc%252B4TbHKAg9WiWPQfnflJUbt8%252BiCpIrXI%252FyApB%252FeM"
            "&rsv_pq=17372243552828969370&ss=111&rsv_sug4=14106&inputT=12708"
            "&oq=%E4%B9%A0%E8%BF%91%E5%B9%B3"
        )
        request = urllib2.Request(url_query)
        response = urllib2.urlopen(request)
        try:
            body = response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()
        # Parse the body content and extract the result items (links).
        return self.parse_baidu(body)
    except Exception as e:
        print("Query Baidu Error: {}".format(e))
        return None


def query_google(self, keyword):
    """Placeholder for a Google backend; always returns a fixed message."""
    return "Not implemented!"


def GET(self):
    """Handle an HTTP GET: search Baidu for ``keyword`` and render the page.

    The ``keyword`` request parameter defaults to ``"demo_keyword"``.

    Returns:
        The rendered ``template_index`` output, or a ``"GET Error: ..."``
        string when anything in the handler raises.
    """
    response = None
    arguments = {}
    try:
        key_word = web.input(keyword="demo_keyword")
        arguments['keyword'] = key_word.keyword
        # REMOTE_PORT is informational only; do not fail the whole request
        # when the front-end server does not forward it.
        arguments['extern_ip'] = "{}:{}".format(
            web.ctx.ip, web.ctx.env.get("REMOTE_PORT", "unknown"))
        elements = self.query_baidu(key_word.keyword)
        response = self.render.template_index(arguments, elements)
    except Exception as e:
        response = "GET Error: {}".format(e)
    return response


def POST(self):
    """POST is accepted but intentionally does nothing."""
    pass


HTML 模板页面部分实现:

<div id="result">
$# One heading (the result rank) plus a list of link/description entries
$# per search result; a fallback heading when there are no results.
$if elements:
    $for e in elements:
        <h3>$e['order']</h3>
        <ul>
        $if e['link'] and e['title']:
            <li><a href="$e['link']">$e['title']</a></li>
        $if e['desc']:
            <li>$e['desc']</li>
        </ul>
$else:
    <h3>Not get values!</h3>
</div>


0 0
原创粉丝点击