爬虫基本操作——从文件中批量读取需求,查询后,批量写入Excel表中

来源:互联网 发布:java web权限控制框架 编辑:程序博客网 时间:2024/06/02 01:01

爬虫基本操作——从文件中批量读取需求,查询后,批量写入Excel表中

Mark下,防遗忘

代码如下(有注释)

# code by sunkun 20170417# -*- coding: UTF-8 -*-import urllibimport urllib2import reimport csvimport codecsfile = open("categorylist.txt") # 需要查询的关键词按行放在文件里resultFile = open('C:\Users\sunkun\Desktop\categorylist_type.csv','wb')  # 输出的文件。 # resultFile2 = open('C:\Users\sunkun\Desktop\pinpai2.csv','a+') resultFile.write(codecs.BOM_UTF8) # 防止中文乱码fieldname = ['keywords', 'type'] # 返回文件,第一列放关键词,第二列放查询到的返回类型。writer = csv.DictWriter(resultFile, fieldnames=fieldname)# writer.writeheader() # 是否写上表头while 1:    line = file.readline()    if not line:        break    # print line    keyWords = (urllib).quote(line)     # url不支持中文,进行转码操作    # keyWords = line.encode('utf-8')    # line = '新闻软件'    # url = 'http://quickshare.swc.sogou/quickshared?content=Clapton&class=null&part=0&platform=iOS&location=116.326022%7C39.992892&id=1481119148606063034&naming=raphael&framework=raphael'    urlStart = 'http://quickshare.swc.sogou/quickshared?content='    urlEnd = '&class=null&part=0&platform=iOS&location=116.326022%7C39.992892&id=1481119148606063034&naming=raphael&framework=raphael'    url = urlStart + keyWords + urlEnd    try:        request = urllib2.Request(url)        response = urllib2.urlopen(request)        content = response.read().decode('gbk')        pattern = re.compile('<returntype>(.*?)</returntype>', re.S)        resultType = re.findall(pattern, content)        # print content        print line, resultType[0]  # 测试        writer.writerow({'keywords':line, 'type':resultType[0]}) # 写入到文件中        # singleResult = ""+line+resultType[0]        # singleResult = [line, resultType[0]]        # print singleResult        # writer = csv.writer(resultFile)        # resultFile.writelines(singleResult)        # resultFile.write(line+'\t')        # resultFile.write("\t")        # resultFile.write(resultType[0])        # resultFile.write("\n")        # resultFile.write(resultType[0])        # resultFile1.writelines(line)        # resultFile2.writelines(resultType[0])        # resultFile2.writelines("\n")    except urllib2.URLError, e: # 异常处理        if hasattr(e, "code"):            print e.code        if hasattr(e, "reason"):            print e.reason
0 0
原创粉丝点击