python之自定义爬虫脚本

来源:互联网 发布:软件培训班哪个好学 编辑:程序博客网 时间:2024/05/16 14:30

## spider.py


# -*- coding: utf-8 -*-
"""Simple breadth-first image spider.

Reads seed URLs (one per line) from a text file, follows <a href> links
and downloads every <img src> image it finds into ./download/.

Ported from the original Python 2 (urllib2/httplib) version to Python 3.
Concrete fixes over the original:
  * extension regexes escape the dot, so 'foohtml' no longer matches
    '.html$' via the '.' wildcard;
  * toFullPath no longer produces 'http://http://...' for relative links
    (getBaseUrl already includes the scheme);
  * the download directory is created on demand;
  * files are opened with context managers; bare `except:` narrowed;
  * the crawl no longer starts as an import side effect.
"""

import os
import random
import re
import sys  # kept from the original import list (unused there as well)
import time
import urllib.request

# Patterns hoisted out of the per-URL helpers and compiled once.
_FILE_RE = re.compile(
    r"\.(html|htm|xml|txt|css|js|avi|flv|jpg|gif|bmp|png|xhtml|dat|doc|xls|php|jsp|asp)$"
)
_IMAGE_RE = re.compile(r"\.(jpg|gif|bmp|png|jpeg)$", re.IGNORECASE)
_HTML_RE = re.compile(r"\.(html|htm|asp|aspx)$")
_ANCHOR_RE = re.compile(r'<a\s.*href="([^<>\s\"]+)"')
_IMG_RE = re.compile(r'<img\s.*src="([^<>\s\"]+)"')


def removeBR(site):
    """Strip newline characters from *site*."""
    return site.replace("\n", "")


def getRequestList(fname):
    """Read the seed URL list, one URL per line, from *fname*."""
    # fix: original leaked the file handle (open without close)
    with open(fname, "r") as fp:
        return [removeBR(line) for line in fp]


def inOldSet(url, oldSet):
    """True if *url* has already been visited."""
    return url in oldSet


def isEndWithJavascript(site):
    """True if *site* contains a '/javascript:' pseudo-link."""
    return re.search(r"/javascript:", site) is not None


def ignoreJavascript(site):
    """Drop a trailing '/javascript:...' pseudo-link from *site*."""
    if isEndWithJavascript(site):
        head, _ = site.rsplit("/javascript:", 1)
        return head
    return site


def isFile(url):
    """True if *url* ends in a known file extension."""
    return _FILE_RE.search(url) is not None


def getBaseUrl(url):
    """http://aa.com/index.txt -> http://aa.com (scheme kept)."""
    if isFile(url):
        head, _ = url.rsplit("/", 1)
        return head
    # Drop a single trailing slash on directory-style URLs.
    return url[:-1] if url.endswith("/") else url


def addIndexForSite(site):
    """Append index.html to a directory-style URL."""
    if isFile(site):
        return site
    return "%s/index.html" % getBaseUrl(site)


def isError(site):
    """True if fetching *site* raises (network probe)."""
    try:
        getCntList(site)
        return False
    except Exception:
        return True


def isPHPIndex(site):
    """True if <site>/index.html is unreachable (assume index.php)."""
    return isError(addIndexForSite(site))


def pathToFile(site):
    """Map a directory URL to its index file (index.html or index.php)."""
    if isFile(site):
        return site
    ext = "index.php" if isPHPIndex(site) else "index.html"
    return "%s/%s" % (getBaseUrl(site), ext)


def getRelPath(sub):
    """./a/c.txt -> a/c.txt; '../x' and bare relative paths pass through."""
    if sub.startswith("."):
        if sub[1:2] == ".":
            return sub
        if sub[1:2] == "/":
            return sub[2:]
    return sub


def getDomain(url):
    """Return the host part of a full http:// URL, else None."""
    if not isFullPath(url):
        return None
    _, rest = url.split("http://", 1)
    if "/" in rest:
        host, _ = rest.split("/", 1)
        return host
    return rest


def toFullPath(sub, parent):
    """Resolve a (possibly relative) link *sub* against page URL *parent*.

    Fix vs original: getBaseUrl() already returns the scheme, so the
    relative-path branch must not prepend a second 'http://'.
    """
    if isFullPath(sub):
        return sub
    if sub == "/":
        return "http://%s" % getDomain(parent)
    if sub.startswith("/"):
        # Host-absolute link: join the parent's domain with the path.
        return "http://%s%s" % (getDomain(parent), sub)
    return "%s/%s" % (getBaseUrl(parent), getRelPath(sub))


def formatUrl(url):
    """Normalise *url* (currently: strip javascript pseudo-links)."""
    return ignoreJavascript(url)


def isFullPath(url):
    """True if *url* is an absolute http:// URL."""
    return re.match(r"\s*http://", url) is not None


def isPHPFile(site):
    """True for .php URLs, with or without a query string."""
    return re.search(r"\.php(\?|$)", site) is not None


def isValidHTML(url):
    """True if *url* is an absolute, non-PHP page worth parsing."""
    return isFullPath(url) and not isPHPFile(url)


def getCntList(url):
    """Fetch *url*; return its HTML split so each <a>/<img> tag starts
    a new list entry (crude tokenisation for the regex scanners)."""
    chunks = []
    try:
        raw = urllib.request.urlopen(url, timeout=240).read()
        cnt = raw.decode("utf-8", errors="replace")
        cnt = cnt.replace("\n", "")
        cnt = cnt.replace("<a ", "\n<a ")
        cnt = cnt.replace("<img ", "\n<img ")
        chunks = [removeBR(s) for s in cnt.split("\n") if s]
    except Exception:
        print("load error~2")
    return chunks


def getAnchorHref(text):
    """Iterate over href values of <a> tags in *text*."""
    return _ANCHOR_RE.finditer(text)


def getImgSrc(text):
    """Iterate over src values of <img> tags in *text*."""
    return _IMG_RE.finditer(text)


def isImage(url):
    """True if *url* ends in a known image extension (any case)."""
    return _IMAGE_RE.search(url) is not None


def isHTML(url):
    """True if *url* ends in a known page extension."""
    return _HTML_RE.search(url) is not None


gcnt = 0  # number of pages parsed so far (progress counter)


def parseHTML(url):
    """Parse one page; return (imageSet, htmlSet) of absolute links."""
    global gcnt
    gcnt += 1
    print("parsing[%d]-->%s" % (gcnt, url))
    found = []
    for chunk in getCntList(url):
        for a in getAnchorHref(chunk):
            full = formatUrl(toFullPath(a.group(1), url))
            print("anchor[sub]->%s" % a.group(1))
            print("fullPath[sub]->%s" % full)
            found.append(full)
        for im in getImgSrc(chunk):
            print("image[sub]->%s" % im.group(1))
            found.append(formatUrl(toFullPath(im.group(1), url)))
    imageSet = [r for r in found if isImage(r)]
    htmlSet = [r for r in found if isHTML(r)]
    return imageSet, htmlSet


def updateUrlSet(urlSet, htmlSet, oldSet):
    """Append pages from *htmlSet* that are neither visited nor queued."""
    for h in htmlSet:
        if h not in oldSet and h not in urlSet:
            urlSet.append(h)


def getExt(url):
    """Return the extension after the last dot (raises if none)."""
    return url.rsplit(".", 1)[1]


def getRandomName():
    """Timestamp plus random suffix — unique enough for one crawl."""
    return "%d_%d" % (int(time.time()), random.randint(0, 99999))


def getPath(url):
    """Random local filename that keeps *url*'s extension."""
    return "%s.%s" % (getRandomName(), getExt(url))


def getUrl(url):
    """Fetch *url*; return raw bytes, or None on any failure."""
    try:
        return urllib.request.urlopen(url, timeout=240).read()
    except Exception:
        print("load error~")
        return None


def saveToRoot(data, path):
    """Write *data* to ./download/<path>, creating the directory."""
    root = os.path.join(os.getcwd(), "download")
    os.makedirs(root, exist_ok=True)  # fix: dir was assumed to exist
    with open(os.path.join(root, path), "wb") as fp:
        fp.write(data)


def saveImage(imgSet):
    """Download and store every image URL in *imgSet*."""
    for site in imgSet:
        print("Get-->%s" % site)
        page = getUrl(site)
        if page is None:
            continue
        path = getPath(site)
        print("Save-->%s" % path)
        saveToRoot(page, path)


def main(fname):
    """Breadth-first crawl seeded from the URLs in *fname*.

    Note: urlSet grows while it is iterated — Python's list iteration
    sees elements appended by updateUrlSet, and that is what drives
    the crawl forward.
    """
    urlSet = getRequestList(fname)
    oldSet = []
    for url in urlSet:
        if inOldSet(url, oldSet):
            continue  # fix: original `del(url)` only unbound the loop var
        newUrl = formatUrl(url)
        if isValidHTML(newUrl):
            imgSet, htmlSet = parseHTML(newUrl)
            saveImage(imgSet)
            oldSet.append(url)
            oldSet.append(newUrl)
            updateUrlSet(urlSet, htmlSet, oldSet)


def test(urls):
    """Debug helper: print the normalised form of each URL."""
    for t in urls:
        print(formatUrl(removeBR(t)))


test.__test__ = False  # debug helper, not a unit test — skip collection


if __name__ == "__main__":
    # fix: the crawl used to start as a module import side effect
    main("list.txt")


## list.txt

http://www.gaoxiaola.com/p/gif/index_4.html


@@用法说明

python spider.py(种子 URL 列表固定从当前目录的 list.txt 读取——文件名在脚本末尾的 main('list.txt') 中写死,命令行参数不会被使用)

从 list.txt 中的种子网页出发,沿 &lt;a href&gt; 链接逐页抓取 &lt;img src&gt; 图片,保存到当前目录下的 download/ 文件夹

0 0
原创粉丝点击