使用requests和re库对淘宝商品信息进行定向爬取

来源:互联网 发布:黄金烤鸡腿堡 知乎 编辑:程序博客网 时间:2024/06/06 08:47
# coding=utf-8
# Targeted crawl of Taobao search results: download result pages with
# requests, pull price/title pairs out of the embedded JSON with regular
# expressions, and print them as a tab-separated table.
import json
import re

# Step applied to the "s" query parameter between consecutive result pages
# (presumably the number of items Taobao shows per page -- confirm).
_PAGE_STEP = 44

# Compiled once; both capture only the value between the quotes.
_PRICE_RE = re.compile(r'"view_price":"([\d.]*)"')
# The title pattern tolerates backslash-escaped characters (e.g. \") so a
# quoted word inside a title does not end the match early.
_TITLE_RE = re.compile(r'"raw_title":"((?:[^"\\]|\\.)*)"')


def _decodeJsonString(raw):
    """Decode the body of a JSON string literal (escapes such as \\uXXXX).

    Falls back to returning *raw* unchanged when it is not valid JSON string
    content, so one odd title never aborts the whole page.
    """
    try:
        return json.loads('"' + raw + '"')
    except ValueError:
        return raw


def getHtmlText(url):
    """Fetch *url* and return the response body as text.

    Returns:
        The decoded page text, or the single-space string " " when the
        request fails (connection error, timeout, or non-2xx status).
    """
    # Imported here so the parsing/printing helpers stay importable on
    # machines where the third-party requests package is not installed.
    import requests

    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Use the encoding detected from the content rather than the header.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return " "


def parsePage(list, html):
    """Append ``[price, title]`` pairs found in *html* to *list* in place.

    Args:
        list: Output list that is extended with one ``[price, title]`` entry
            per product. (The name shadows the builtin but is kept so
            existing keyword callers continue to work.)
        html: Page text as returned by ``getHtmlText``.

    Prices and titles are matched independently and paired by position; if
    the counts differ, the unmatched tail is ignored.
    """
    if not html:
        return
    prices = _PRICE_RE.findall(html)
    titles = _TITLE_RE.findall(html)
    # Values are taken from regex capture groups and decoded with json,
    # never eval(): the page content is untrusted remote input.
    for price, rawTitle in zip(prices, titles):
        list.append([price, _decodeJsonString(rawTitle)])


def getGoodsList(list):
    """Print a header followed by one numbered row per ``[price, title]``.

    Args:
        list: Sequence of ``[price, title]`` entries as built by
            ``parsePage``.
    """
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))
    # Serial numbers start at 1 and advance with every row.
    for count, g in enumerate(list, start=1):
        print(tplt.format(count, g[0], g[1]))


def main():
    """Crawl the first ``depth`` result pages for one keyword and print them."""
    goods = "书包"
    depth = 2
    start_url = "https://s.taobao.com/search?q=" + goods
    infoList = []
    for i in range(depth):
        url = start_url + "&s=" + str(_PAGE_STEP * i)
        # A failed fetch yields " ", which simply contributes no rows.
        html = getHtmlText(url)
        parsePage(infoList, html)
    getGoodsList(infoList)


if __name__ == '__main__':
    main()