Crawling WAP Baidu Tieba and the Phoenix Finance A-share list


I wrote both of these crawlers three months ago, when I had just started learning Python, by imitating examples on GitHub. They are fairly messy: there is no IP proxying and no time.sleep throttling between requests. The first one uses XPath, the second regular expressions; to this day I remember the thrill of the first time my own regex matched the data. As for storage, the first saves to a TXT file and the second writes into the columns of a MySQL database.
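For anyone rerunning these scripts, here is a minimal sketch of those two missing niceties. The proxy address and helper name are purely illustrative, not part of the original code:

# -*- coding:utf-8 -*-
import time
import requests

# hypothetical proxy address -- substitute a real one
PROXIES = {'http': 'http://127.0.0.1:8080'}

def polite_get(url, delay=2.0):
    # route the request through the proxy, then pause so the site is not hammered
    resp = requests.get(url, proxies=PROXIES, timeout=10)
    time.sleep(delay)
    return resp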

Having crawled many more sites since then, I would now reach for the requests and BeautifulSoup4 packages instead; the pair is genuinely convenient.
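As a minimal sketch of that combination (the URL and class name below are placeholders, not taken from either crawler):

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup

resp = requests.get('http://example.com/page')       # placeholder URL
soup = BeautifulSoup(resp.content, 'html.parser')
for div in soup.find_all('div', class_='post'):      # placeholder class name
    print(div.get_text(strip=True))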

------------

Crawling WAP Baidu Tieba, saving to TXT


# -*- coding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import requests
from lxml import etree


def spider():
    yulu = []  # accumulates every post we scrape
    for i in range(0, 38):  # the thread had 38 pages at the time
        url = ('http://tieba.baidu.com/mo/q---E69E1F2CE3B3F602E8A4E9DBB498F420%3AFG%3D1'
               '--1-1-0--2--wapp_1492841547755_799/m'
               '?kz=4668253092&new_word=&pn={}0&lp=6005').format(i)
        html = requests.get(url)
        select = etree.HTML(html.content)
        content_field = select.xpath('//div[@class="d"]')
        print u'新一页'  # "new page"
        for each in content_field:
            # relative './/' keeps the search inside this content block
            content = each.xpath('.//div[@class="i"]/text()')
            for item in content[:30]:  # at most 30 posts per page
                print item
                yulu.append(item)
    print yulu
    with open('data2.txt', 'wb') as f:
        for item in yulu:
            f.write((item + '\n').encode('utf-8'))


spider()
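A note on that long WAP URL: kz appears to be the thread id and pn the post offset (the pn={}0 formatting steps through 0, 10, 20, ...), while the q---...--- segment looks like a session token copied out of the browser, so the link probably expires and would need refreshing before the script is rerun.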


------------

Crawling the Phoenix Finance A-share list

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib2
import re
import MySQLdb


class FH:

    def __init__(self):
        self.pageIndex = 1
        self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
        self.headers = {'User-Agent': self.user_agent}
        self.list = []

    def getPage(self, pageIndex):
        """Fetch one page of the A-share list, sorted by change percent, descending."""
        try:
            url = ('http://app.finance.ifeng.com/list/stock.php'
                   '?t=ha&f=chg_pct&o=desc&p=' + str(pageIndex))
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request)
            return response.read().decode('utf-8')
        except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print "error", e.reason
            return None

    def getPageItems(self, pageIndex):
        """Extract (link, code, name) triples from the page and insert them into MySQL."""
        pageCode = self.getPage(pageIndex)
        if not pageCode:
            print "page load error"
            return None
        pattern = re.compile(
            '<td><a href="(.*?)" target="_blank">(.*?)</a></td>'
            '.*?target="_blank">(.*?)</a></td>', re.S)
        items = re.findall(pattern, pageCode)
        pagelist = []
        # one connection per page, not per row
        conn = MySQLdb.connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='94159415',
            db='movie',
            charset='utf8'
        )
        cur = conn.cursor()
        for item in items:
            pagelist.append([item[0].strip(), item[1].strip(), item[2].strip()])
            print(item[0])
            print(item[1])
            print(item[2])
            # parameterized query: the driver escapes the values itself
            cur.execute("insert into A_STOCK_LIST values (NULL, %s, %s, %s)",
                        (item[0], item[1], item[2]))
        cur.close()
        conn.commit()
        conn.close()
        return pagelist

    def loadPage(self):
        # keep a small buffer of pages; fetch the next one when it runs low
        if len(self.list) < 2:
            pagelist = self.getPageItems(self.pageIndex)
            if pagelist:
                self.list.append(pagelist)
                self.pageIndex += 1

    def start(self):
        print u'正在读取'  # "loading"
        self.loadPage()
        nowPage = 0
        while nowPage < 24:  # 24 more pages after the first
            nowPage += 1
            del self.list[0]
            self.loadPage()


spider = FH()
spider.start()
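The INSERT above assumes an A_STOCK_LIST table already exists in the movie database. A plausible definition, run once beforehand (the column names and widths are my guesses for the three captured groups plus an auto-increment id):

# -*- coding:utf-8 -*-
import MySQLdb

conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                       passwd='94159415', db='movie', charset='utf8')
cur = conn.cursor()
# column names are guesses: link, stock code, stock name
cur.execute("""
    create table if not exists A_STOCK_LIST (
        id   int auto_increment primary key,
        url  varchar(255),
        code varchar(16),
        name varchar(64)
    ) default charset=utf8
""")
conn.commit()
cur.close()
conn.close()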
