Python Web Data Scraping (xpath Edition)

Changes in this version:

(1) Fetched HTML pages are now cached in SQLite, which greatly speeds up reprocessing of the data: the first run takes roughly 6 hours, while later runs finish in about 3 minutes.
(2) xpath replaces the earlier regular expressions for HTML parsing. Locating elements with xpath is simpler and more convenient, and the lxml parser behind it automatically repairs malformed HTML. xpath is really powerful! (A short sketch follows this list.)
(3) Duplicate results are removed.
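The point about tolerating broken markup is easy to see in isolation. Below is a minimal sketch, not part of the original script (the sample HTML is made up), showing lxml repairing deliberately malformed HTML so that an xpath query still finds the cells:

# Minimal sketch: etree.HTML() repairs the broken markup (missing closing tags),
# so the xpath query still locates both cells.
from lxml import etree

broken_html = "<table class='n_table'><tr><td>Toronto<td>Canada"  # malformed on purpose
tree = etree.HTML(broken_html)
cells = tree.xpath("//table[@class='n_table']//td")
print [cell.text for cell in cells]   # ['Toronto', 'Canada']

The full script follows.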
# coding:utf-8
# Practice of scraping web data with xpath
# by redice 2010.11.05

import codecs
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

import urllib2
from urllib2 import URLError, HTTPError
import zlib
import sqlite3

try:
    import cPickle as pickle
except ImportError:
    import pickle

conn = sqlite3.connect("html_cache.db")
conn.text_factory = lambda x: unicode(x, 'utf-8', 'replace')
curs = conn.cursor()

# if the htmls table does not exist, create it
curs.execute('''CREATE TABLE if not exists htmls(url VARCHAR(255) UNIQUE, content TEXT, size INTEGER);''')
conn.commit()


def serialize(value):
    """convert an object to a string suitable for saving in the db
    (the compressed-pickle version is left commented out)"""
    #return sqlite3.Binary(zlib.compress(pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL), 5))
    return value


def deserialize(value):
    """convert a string from the database back into an object"""
    #return pickle.loads(zlib.decompress(value)) if value else value
    return value


def gethtml(url):
    '''Fetch the target html, using html_cache.db as a cache'''
    try:
        # look up html_cache.db first
        curs.execute("select * from htmls where url=?;", (url,))
        row = curs.fetchone()
        if row:
            # cache hit
            return deserialize(str(row[1]))

        response = urllib2.urlopen(url)
        result = response.read()

        # cache miss: store the page in html_cache.db
        curs.execute("insert into htmls values(?,?,?);", (url, serialize(result), len(result)))
        conn.commit()

        print "saved %s into html_cache.db" % (url)

        return result
    except URLError, e:
        if hasattr(e, 'reason'):
            print 'Failed to reach a server.'
            print 'Reason: ', e.reason
        elif hasattr(e, 'code'):
            print "The server couldn't fulfill the request."
            print 'Error code: ', e.code
        return 'None'
# end def gethtml


import re


def regexmatch(rule, s):
    '''Fetch all matched strings. Return a list.'''
    p = re.compile(rule)
    return p.findall(s)
# end def regexmatch


def decodeHtmlEntity(s):
    '''normalize a scraped string: re-encode it in the locale encoding
    and replace non-breaking spaces with plain spaces'''
    if s == '' or not s:
        return ''
    result = s

    import locale
    result = result.decode(locale.getdefaultlocale()[1], "ignore").encode(locale.getdefaultlocale()[1]).replace("\xc2\xa0", " ")

    return result
# end def decodeHtmlEntity


# final result
dining_db = []
total = 0

# debug switch: when set, stop after 10 records
debug = 0

# Fetch menupalace.com's html
print 'Fetching html from http://menupalace.com ...'
html = gethtml('http://menupalace.com')

from lxml import etree

if html == '' or html == 'None':
    print "Can't get the html from http://menupalace.com"
    sys.exit()

try:
    tree = etree.HTML(html)
    nodes = tree.xpath("//table[@class='n_table']")
except:
    f = open("log.txt", "a")
    f.write(html)
    print "error while parsing the html from http://menupalace.com"
    sys.exit()

for node in nodes:
    if debug and total >= 10:
        break

    # Fetch country
    n = node.xpath("./tr[1]/td[1]/img")
    country = ""
    if len(n) > 0:
        country = decodeHtmlEntity(n[0].tail)
        country = country.strip()

    # Fetch all links
    ls = node.xpath(".//a")

    # Walk through all links
    for l in ls:
        if debug and total >= 10:
            break

        # city
        city = decodeHtmlEntity(l.text)
        city = city.strip()

        prelink = l.get("href")
        link = prelink + "restaurants/restaurants.aspx"

        html = gethtml(link)
        if html == '' or html == 'None':
            print "Can't get the html from " + link
            continue

        try:
            subtree = etree.HTML(html)
            subnodes = subtree.xpath("//td[@class='frame_style_padding']")
        except:
            if debug:
                f = open("log.txt", "a")
                f.write(html)
                print "error while parsing the html from " + link
                sys.exit()
            else:
                continue

        for sn in subnodes:
            if debug and total >= 10:
                break

            sls = sn.xpath(".//a")
            for sl in sls:
                if debug and total >= 10:
                    break

                link = prelink + "restaurants/" + sl.get("href")
                print 'Fetching html from ' + link + ' ...'

                html = gethtml(link)
                if html == '' or html == 'None':
                    print "Can't get the html from " + link
                    continue

                try:
                    sstree = etree.HTML(html)
                    ssnodes = sstree.xpath("//table[@width='94%'][@height='80px']")
                except:
                    if debug:
                        f = open("log.txt", "a")
                        f.write(html)
                        f.write(" ")
                        print "error while parsing the html from " + link
                        sys.exit()
                    else:
                        continue

                for ssn in ssnodes:
                    if debug and total >= 10:
                        break

                    # name
                    n = ssn.xpath(".//tr[1]/td[1]/a[1]")
                    name = ''
                    if len(n) > 0:
                        name = decodeHtmlEntity(n[0].text)
                        name = name.strip()

                    # address
                    n = ssn.xpath(".//tr[2]/td[1]")

                    # address array: (address, city, state) tuples
                    address_arr = []

                    address = ''
                    state = ''
                    if len(n) > 0:
                        address = decodeHtmlEntity(n[0].text)

                        # the restaurant has many locations
                        if address.strip() == 'Various Locations':
                            n = ssn.xpath(".//tr[1]/td[1]/div[1]/span[1]")
                            if len(n) > 0:
                                address = decodeHtmlEntity(n[0].text)
                                addrlist = address.split()
                                if len(addrlist) > 4:
                                    state = addrlist[-2]
                                    city = addrlist[-3]
                                    # remove state and city from the address
                                    address = address.replace(state, '')
                                    address = address.replace(city, '')
                                    address = address.replace(addrlist[-1], '')
                                    address = address.strip()
                                    address_arr.append((address, city, state))

                                    # additional locations follow the <br> tags
                                    brn = ssn.xpath(".//tr[1]/td[1]/div[1]/span[1]/br")
                                    for n in brn:
                                        address = decodeHtmlEntity(n.tail)
                                        addrlist = address.split()
                                        if len(addrlist) > 4:
                                            state = addrlist[-2]
                                            city = addrlist[-3]
                                            # remove state and city from the address
                                            address = address.replace(state, '')
                                            address = address.replace(city, '')
                                            address = address.replace(addrlist[-1], '')
                                            address = address.strip()
                                            address_arr.append((address, city, state))
                            else:
                                address_arr.append(('', '', ''))
                        else:
                            addrlist = address.split()
                            if len(addrlist) > 3:
                                state = addrlist[-1]
                                city = addrlist[-2]
                                # remove state and city from the address
                                address = address.replace(state, '')
                                address = address.replace(city, '')
                                address = address.strip()
                                address_arr.append((address, city, state))

                    # website
                    website = ''
                    n = ssn.xpath(".//tr[3]/td[1]/a[1]")
                    if len(n) > 0:
                        website = decodeHtmlEntity(n[0].text)
                        website = website.strip()

                    if name and len(address) > 0:
                        for addr in address_arr:
                            dining = {}
                            dining['name'] = name
                            if addr[0] == 'Various Locations':
                                dining['address'] = ''
                            else:
                                dining['address'] = addr[0]
                            dining['city'] = addr[1]
                            dining['state'] = addr[2]
                            dining['country'] = country
                            dining['website'] = website
                            # Avoid duplication
                            if not (dining in dining_db):
                                dining_db.append(dining)
                                total = total + 1
                            if debug and total >= 10:
                                break

# Close the database link
conn.close()

# print and save the final result
import csv

cf = open("scraping_result.csv", "w")
writer = csv.writer(cf)
writer.writerow(['name', 'address', 'city', 'state', 'country', 'website'])
for item in dining_db:
    rlist = []
    rlist.append(item['name'])
    rlist.append(item['address'])
    rlist.append(item['city'])
    rlist.append(item['state'])
    rlist.append(item['country'])
    rlist.append(item['website'])
    writer.writerow(rlist)
cf.close()

print 'The result has been saved into scraping_result.csv!'
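The serialize/deserialize helpers above are deliberately left as pass-throughs, with the compressed variant only hinted at in the commented-out lines. A minimal sketch of that variant, assuming the htmls table is created with a BLOB content column instead of TEXT (the table and column names come from the script; the BLOB schema is an assumption):

# Sketch of the compressed-cache variant hinted at by the commented-out lines.
# Assumption: the htmls table declares its content column as BLOB, not TEXT.
import sqlite3
import zlib
try:
    import cPickle as pickle
except ImportError:
    import pickle

def serialize(value):
    """Pickle and zlib-compress a value before storing it in SQLite."""
    return sqlite3.Binary(zlib.compress(pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL), 5))

def deserialize(value):
    """Decompress and unpickle a value read back from SQLite."""
    return pickle.loads(zlib.decompress(str(value))) if value else value

With this variant, gethtml() would stay unchanged apart from no longer wrapping row[1] in str() before deserializing.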
