# Python: A Custom Web Crawler Script
## spider.py
```python
# Simple image crawler: reads seed URLs from a file, follows links on the
# pages it visits, and saves every image it finds into ./download/.
import urllib2, httplib
import os, sys, re, time, random

#[OK]
def removeBR(site):
    result = re.search(r'\n', site)
    if result:
        return site.replace('\n', '')
    return site

#[OK]
def getRequestList(fname):
    file = open(fname, "r")
    sites = file.readlines()
    list = []
    for s in sites:
        list.append(removeBR(s))
    return list

#[OK]
def inOldSet(url, set):
    if url in set:
        return True
    return False

#[OK]
def isEndWithJavascript(site):
    ret = re.search("/javascript:", site)
    if ret:
        return True
    return False

#[OK]
def ignoreJavascript(site):
    if isEndWithJavascript(site):
        s1, s2 = site.rsplit("/javascript:", 1)
        return s1
    return site

#[50%_OK]
def isFile(url):
    res = re.search("(.html|.htm|.xml|.txt|.css|.js|.avi|.flv|.jpg|.gif|.bmp|.png|.xhtml|.dat|.doc|.xls|.php|.jsp|.asp)$", url)
    if res:
        return True
    return False

#[OK] http://aa.com/index.txt -> http://aa.com
def getBaseUrl(url):
    if isFile(url):
        s1, s2 = url.rsplit("/", 1)
        return s1
    if url[len(url) - 1:] == "/":
        return url[:len(url) - 1]
    else:
        return url

#[OK]
def addIndexForSite(site):
    if isFile(site):
        return site
    bs = getBaseUrl(site)
    return "%s/index.html" % (bs)

#[OK]
def isError(site):
    try:
        getCntList(site)
        return False
    except:
        return True

#[OK]
def isPHPIndex(site):
    site = addIndexForSite(site)
    if isError(site):
        return True
    else:
        return False

#[OK]
def pathToFile(site):
    if isFile(site):
        return site
    bs = getBaseUrl(site)
    ext = "index.html"
    if isPHPIndex(site):
        ext = "index.php"
    return "%s/%s" % (bs, ext)

#[OK] ./a/c.txt /a/b.txt a/v.txt -> a/a.txt
def getRelPath(sub):
    if sub[:1] == ".":
        if sub[1:2] == ".":
            return sub
        if sub[1:2] == "/":
            return sub[2:]
        return sub
    return sub

#[OK]
def getDomain(url):
    if isFullPath(url):
        s1, s2 = url.split("http://", 1)
        if s2.find(r'/') != -1:
            s3, s4 = s2.split(r'/', 1)
            return s3
        else:
            return s2

#[OK]
def toFullPath(sub, parent):
    if isFullPath(sub):
        return sub
    if sub == "/":
        return "http://%s" % (getDomain(parent))
    relPath = getRelPath(sub)
    if sub[:1] == "/":
        # rooted path: join with the parent's domain
        return "http://%s%s" % (getDomain(parent), relPath)
    # relative path: the parent's base URL already carries the http:// scheme
    return "%s/%s" % (getBaseUrl(parent), relPath)

#[OK]
def formatUrl(url):
    url = ignoreJavascript(url)
    #if not isFile(url):
    #    url = pathToFile(url)
    return url

#[OK]
def isFullPath(url):
    if re.match("\s*http://", url):
        return True
    return False

#[OK]
def isPHPFile(site):
    ret = re.search("\.php\?", site)
    if ret:
        return True
    ret = re.search("\.php$", site)
    if ret:
        return True
    return False

#[OK?]
def isValidHTML(url):
    if isPHPFile(url) or not isFullPath(url):
        return False
    return True

# Download a page and rearrange it so every <a ...> and <img ...> tag starts
# a new line, which keeps the regex matching below simple.
def getCntList(url):
    list = []
    try:
        cnt = urllib2.urlopen(url, timeout=240).read()
        cnt = cnt.replace('\n', '')
        cnt = cnt.replace('<a ', '\n<a ')
        cnt = cnt.replace('<img ', '\n<img ')
        sites = cnt.split('\n')
        for s in sites:
            if s:
                #print removeBR(s) + "\n\n"
                list.append(removeBR(s))
    except:
        print "load error~2"
    return list

#[OK]
def getAnchorHref(str):
    return re.finditer(r'<a\s.*href="([^<>\s\"]+)"', str)

def getImgSrc(str):
    return re.finditer(r'<img\s.*src="([^<>\s\"]+)"', str)

#[OK]
def isImage(url):
    res = re.search("(.jpg|.JPG|.gif|.GIF|.bmp|.BMP|.png|.PNG|.jpeg|.JPEG)$", url)
    if res:
        return True
    else:
        return False

#[OK]
def isHTML(url):
    res = re.search("(.html|.htm|.asp|.aspx)$", url)
    if res:
        return True
    else:
        return False

gcnt = 0

#[]
# Parse one page: collect image URLs (iSet) and further HTML pages to crawl (hSet).
def parseHTML(url):
    global gcnt
    gcnt = gcnt + 1
    print "parsing[%d]-->%s" % (gcnt, url)
    hSet = []
    iSet = []
    res = []
    list = getCntList(url)
    for s in list:
        anchor = getAnchorHref(s)
        for a in anchor:
            print "anchor[sub]->%s" % (a.group(1))
            print "fullPath[sub]->%s" % (formatUrl(toFullPath(a.group(1), url)))
            res.append(formatUrl(toFullPath(a.group(1), url)))
        img = getImgSrc(s)
        for im in img:
            print "image[sub]->%s" % (im.group(1))
            res.append(formatUrl(toFullPath(im.group(1), url)))
    for r in res:
        if isImage(r):
            iSet.append(r)
        if isHTML(r):
            hSet.append(r)
    return iSet, hSet

#[OK]
def updateUrlSet(urlSet, htmlSet, oldSet):
    for h in htmlSet:
        if h in oldSet:
            continue
        if h in urlSet:
            continue
        urlSet.append(h)

#[OK]
def getExt(url):
    s1, s2 = url.rsplit(".", 1)
    return s2

#[OK]
def getRandomName():
    return "%d_%d" % (time.time(), random.randint(0, 99999))

#[OK]
def getPath(url):
    name = getRandomName()
    ext = getExt(url)
    return "%s.%s" % (name, ext)

#[OK]
def getUrl(url):
    try:
        return urllib2.urlopen(url, timeout=240).read()
    except:
        print "load error~"
        return None

#[OK]
def saveToRoot(str, path):
    newPath = os.path.join(os.getcwd(), "download")
    if not os.path.isdir(newPath):
        os.makedirs(newPath)
    newPath = os.path.join(newPath, path)
    fp = open(newPath, "wb")
    fp.write(str)
    fp.close()

#[OK]
def saveImage(imgSet):
    for site in imgSet:
        print "Get-->%s" % (site)
        page = getUrl(site)
        if page == None:
            continue
        path = getPath(site)
        print "Save-->%s" % (path)
        saveToRoot(page, path)

#[Test]
# Crawl loop: urlSet keeps growing while we iterate over it, so pages
# discovered on the seed URLs are visited in turn.
def main(fname):
    urlSet = getRequestList(fname)
    oldSet = []
    for url in urlSet:
        if inOldSet(url, oldSet):
            continue
        newUrl = formatUrl(url)
        #print "newUrl::" + newUrl
        if isValidHTML(newUrl):
            imgSet, htmlSet = parseHTML(newUrl)
            saveImage(imgSet)
            oldSet.append(url)
            oldSet.append(newUrl)
            updateUrlSet(urlSet, htmlSet, oldSet)

def test(set):
    for t in set:
        print formatUrl(removeBR(t))

if __name__ == '__main__':
    fname = sys.argv[1] if len(sys.argv) > 1 else 'list.txt'
    main(fname)
```
## list.txt
http://www.gaoxiaola.com/p/gif/index_4.html
## Usage
spider.py list.txt
Attempts to crawl the pages listed in list.txt and save the images found on them (a Python 3 sketch of the same idea follows below).
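spider.py is written for Python 2 (urllib2 and print statements). For readers on Python 3, here is a minimal standard-library sketch of the same fetch-and-save-images idea; the function name grab_images is illustrative, the output directory download/ and seed file list.txt mirror the original, and this sketch only scrapes the seed pages rather than reproducing spider.py's full crawl loop:

```python
# Minimal Python 3 sketch: fetch each page listed in a seed file and save
# every <img src="..."> image it references into ./download/.
import os
import re
import sys
import time
import random
import urllib.request
from urllib.parse import urljoin

IMG_EXT = re.compile(r'\.(jpe?g|gif|bmp|png)$', re.I)
IMG_SRC = re.compile(r'<img\s[^>]*src="([^"\s<>]+)"', re.I)

def grab_images(page_url, out_dir="download"):
    os.makedirs(out_dir, exist_ok=True)
    try:
        html = urllib.request.urlopen(page_url, timeout=240).read().decode("utf-8", "ignore")
    except OSError:
        print("load error:", page_url)
        return
    for src in IMG_SRC.findall(html):
        img_url = urljoin(page_url, src)          # resolve relative paths
        if not IMG_EXT.search(img_url):
            continue
        # random file name, keeping the original extension
        name = "%d_%d%s" % (time.time(), random.randint(0, 99999),
                            os.path.splitext(img_url)[1])
        try:
            data = urllib.request.urlopen(img_url, timeout=240).read()
        except OSError:
            print("load error:", img_url)
            continue
        with open(os.path.join(out_dir, name), "wb") as fp:
            fp.write(data)
        print("saved", name)

if __name__ == "__main__":
    seed_file = sys.argv[1] if len(sys.argv) > 1 else "list.txt"
    with open(seed_file) as f:
        for line in f:
            url = line.strip()
            if url:
                grab_images(url)
```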