zhkmjj

Main crawler script: it reads the search settings from Config.conf, builds the CNKI search URL, walks every result page with spider_search_page.get_paper_url(), and finally calls spider_paper.spider_paper() to fill in the details of each article.

# -*- coding: utf-8 -*-
from configparser import ConfigParser
from urllib.parse import quote
import socket
import os
import math
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import time
import spider_search_page
import spider_paper

if __name__ == '__main__':
    start = time.perf_counter()
    cf = ConfigParser()
    cf.read("Config.conf", encoding='utf-8')
    keyword = cf.get('base', 'keyword')                # search keyword
    maxpage = cf.getint('base', 'maxpage')             # maximum page number
    searchlocation = cf.get('base', 'searchlocation')  # search field
    currentpage = cf.getint('base', 'currentpage')     # page to resume from
    if os.path.exists('data-detail.txt') and currentpage == 0:
        print('Output file already exists, deleting it')
        os.remove('data-detail.txt')
    # map the human-readable search field to the CNKI query prefix
    values = {
        '全文': 'qw',        # full text
        '主题': 'theme',     # theme
        '篇名': 'title',     # title
        '作者': 'author',    # author
        '摘要': 'abstract'   # abstract
    }
    keywordval = str(values[searchlocation]) + ':' + str(keyword)
    # quote() percent-encodes the Chinese characters so they can be put in the URL
    index_url = 'http://search.cnki.com.cn/Search.aspx?q=' + quote(keywordval) + '&rank=&cluster=&val=&p='
    print(index_url)
    # read the total number of hits and compute the number of result pages (15 hits per page)
    html = urllib.request.urlopen(index_url).read()
    soup = BeautifulSoup(html, 'html.parser')
    pagesum_text = soup.find('span', class_='page-sum').get_text()
    maxpage = math.ceil(int(pagesum_text[7:-1]) / 15)
    cf = ConfigParser()
    cf.read("Config.conf", encoding='utf-8')
    cf.set('base', 'maxpage', str(maxpage))
    cf.write(open('Config.conf', 'w', encoding='utf-8'))
    for i in range(currentpage, maxpage):
        page_num = 15
        page_str_num = i * page_num
        page_url = index_url + str(page_str_num)
        print(page_url)
        attempts = 0
        success = False
        while attempts < 50 and not success:
            try:
                spider_search_page.get_paper_url(page_url)
                socket.setdefaulttimeout(10)  # 10-second connection timeout
                success = True
            except socket.error:
                attempts += 1
                print("Retry " + str(attempts))
                if attempts == 50:
                    break
            except urllib.error.URLError:
                attempts += 1
                print("Retry " + str(attempts))
                if attempts == 50:
                    break
        # remember the last finished page so an interrupted run can resume
        cf.set('base', 'currentpage', str(i))
        cf.write(open("Config.conf", "w", encoding='utf-8'))
    spider_paper.spider_paper()  # fill in the details of every collected article
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))
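The main script assumes a Config.conf file with a [base] section holding the four keys read above (keyword, searchlocation, maxpage, currentpage). Below is a minimal sketch of creating such a file with ConfigParser; the concrete values are placeholders and not part of the original post.

# A minimal sketch: writes a Config.conf with the section and keys the crawler reads.
# The values below are placeholders / assumptions, not taken from the post.
from configparser import ConfigParser

cf = ConfigParser()
cf['base'] = {
    'keyword': '数据挖掘',      # search keyword (placeholder)
    'searchlocation': '主题',   # must be one of: 全文, 主题, 篇名, 作者, 摘要
    'maxpage': '0',             # recomputed and overwritten by the main script
    'currentpage': '0',         # resume point; 0 means start from the first page
}
with open('Config.conf', 'w', encoding='utf-8') as fp:
    cf.write(fp)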
spider_paper.py: opens every article URL collected in data-detail.txt, scrapes the authors, affiliation, keywords, abstract and co-cited references from the detail page, and writes one row per article into an Excel workbook.

# -*- coding: utf-8 -*-
import socket
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import time
import xlwt
from configparser import ConfigParser


def spider_paper():
    start = time.perf_counter()
    file = open("data-detail.txt", encoding='utf8')
    cf = ConfigParser()
    cf.read("Config.conf", encoding='utf-8')
    keyword = cf.get('base', 'keyword')  # search keyword
    # write the results into an Excel workbook
    wb = xlwt.Workbook(encoding='utf-8')
    sheet = wb.add_sheet("data-out")
    sheet.write(0, 0, '下载网址')    # download URL
    sheet.write(0, 1, '标题')        # title
    sheet.write(0, 2, '来源')        # source
    sheet.write(0, 3, '引用')        # citations
    sheet.write(0, 4, '作者')        # authors
    sheet.write(0, 5, '作者单位')    # author affiliation
    sheet.write(0, 6, '关键词')      # keywords
    sheet.write(0, 7, '摘要')        # abstract
    sheet.write(0, 8, '共引文献')    # co-cited references
    lines = file.readlines()
    txt_num = 1
    lin_num = 1
    paper_list = []
    for line in lines:
        object = line.split('\t')
        paper_url = object[0]
        if paper_url in paper_list:  # skip duplicate article URLs
            continue
        paper_list.append(paper_url)
        attempts = 0
        success = False
        while attempts < 50 and not success:
            try:
                html = urllib.request.urlopen(paper_url).read()
                soup = BeautifulSoup(html, 'html.parser')
                socket.setdefaulttimeout(10)  # 10-second connection timeout
                success = True
            except socket.error:
                attempts += 1
                print("Retry " + str(attempts))
                if attempts == 50:
                    break
            except urllib.error.URLError:
                attempts += 1
                print("Retry " + str(attempts))
                if attempts == 50:
                    break
        title = soup.find_all('div', style="text-align:center; width:740px; font-size: 28px;color: #0000a0; font-weight:bold; font-family:'宋体';")
        abstract = soup.find_all('div', style='text-align:left;word-break:break-all')
        author = soup.find_all('div', style='text-align:center; width:740px; height:30px;')
        # get the author names
        for item in author:
            author = item.get_text()
        # get the abstract text
        tmp = ''
        for thing in abstract:
            a = thing.strings
            for string in a:
                tmp = tmp + string
            txt_num += 1
        # strip whitespace and control characters from the abstract
        result = tmp.split(' ')
        tstr = ''
        for t in result:
            test = t.split('\n')
            if test != '\t' and test != '\n' and test != '\r' and test != '':
                for i in test:
                    if len(i) > 1:
                        item = i.split('\r')
                        for j in item:
                            object = j.split('\t')
                            for k in object:
                                tstr += k
        # collect the co-cited references, if the page has a 【共引文献】 section
        ifreferen = soup.find_all('td', class_='b14', rowspan='2')
        ref = ''
        for i in range(len(ifreferen)):
            if '【共引文献】' in ifreferen[i].get_text():
                referenceList = soup.find_all('div', id='div_Ref')  # co-cited reference list
                if len(referenceList) == 0:
                    referenceList = soup.find_all('div', class_='div_Ref')
                referenceList = referenceList[i]
                for tdref in referenceList.find_all('td', width='676'):
                    refitem = tdref.a.get("href")
                    refitem = refitem.strip()
                    print(refitem)
                    ref = ref + refitem + ' ,'
        # get the author affiliation by matching the 【作者单位】/【学位授予单位】 marker
        authorUnitScope = soup.find('div', style='text-align:left;', class_='xx_font')
        author_unit = ''
        author_unit_text = authorUnitScope.get_text()
        if '【作者单位】:' in author_unit_text:
            auindex = author_unit_text.find('【作者单位】:', 0)
        else:
            auindex = author_unit_text.find('【学位授予单位】:', 0)
        for k in range(auindex, len(author_unit_text)):
            if author_unit_text[k] == '\n' or author_unit_text[k] == '\t' or author_unit_text[k] == '\r' or \
                    author_unit_text[k] == '】':
                continue
            if author_unit_text[k] == ' ' and author_unit_text[k + 1] == ' ':
                continue
            if author_unit_text[k] != '【':
                author_unit = author_unit + author_unit_text[k]
            if author_unit_text[k] == '【' and k != auindex:
                break
        # get the keywords from the 【关键词】 marker
        key_word = ''
        kwindex = author_unit_text.find('【关键词】:', 0)
        for k in range(kwindex, len(author_unit_text)):
            if author_unit_text[k] == '\n' or author_unit_text[k] == '\t' or author_unit_text[k] == '\r' or \
                    author_unit_text[k] == '】':
                continue
            if author_unit_text[k] == ' ' and author_unit_text[k + 1] == ' ':
                continue
            if author_unit_text[k] != '【':
                key_word = key_word + author_unit_text[k]
            if author_unit_text[k] == '【' and k != kwindex:
                break
        # append the scraped fields to the original tab-separated line and write the row
        line = line.strip('\n')
        line = line + '\t' + str(author) + '\t' + str(author_unit) + '\t' + str(key_word) + '\t' + str(tstr) + '\t' + str(ref) + '\n'
        outstring = line.split('\t')
        for i in range(len(outstring)):
            sheet.write(lin_num, i, outstring[i])
        print('Writing row ' + str(lin_num))
        lin_num += 1
        wb.save('data_out_' + str(keyword) + '.xls')  # save after every row so partial progress is kept
    file.close()
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))
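Since spider_paper() only reads data-detail.txt and Config.conf from the working directory, it can also be re-run on its own to rebuild the Excel output without crawling the search result pages again. A minimal sketch, assuming both files already exist:

# Re-run only the detail-page stage; assumes data-detail.txt and Config.conf
# were already produced by an earlier run of the main script.
import spider_paper

if __name__ == '__main__':
    spider_paper.spider_paper()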

spider_search_page.py: parses a single CNKI search-result page and appends every hit to data-detail.txt as one tab-separated line (article URL, title, source, citation count).

from bs4 import BeautifulSoup
import urllib.request
import sys
import io

# make sure Chinese output prints correctly on consoles that default to another encoding
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
page_num = 15


def get_paper_url(page_url):
    html = urllib.request.urlopen(page_url).read()
    soup = BeautifulSoup(html, 'html.parser')
    f = open('data-detail.txt', 'a+', encoding='utf-8')
    all = soup.find_all('div', class_='wz_content')
    for string in all:
        item = string.find('a', target='_blank')  # article title and link
        href = item.get('href')   # article URL
        title = item.get_text()   # article title
        year_count = string.find('span', class_='year-count')  # source and citation count
        publish = ''
        reference = ''
        for item in year_count:
            item = item.string
            item = item.replace('\n', '')
            item = item.replace('\r', '')
            if '被引次数' in item:   # citation count
                reference = item
            elif '年' in item:       # the source string contains the publication year
                publish = item
        f.write(href + '\t' + title + '\t' + publish + '\t' + reference + '\n')
    f.close()
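Each hit ends up in data-detail.txt as four tab-separated fields, which is exactly the format spider_paper() later splits on. A small illustrative sketch of reading one such line back; the sample values are made up for demonstration and are not real data:

# Illustrative only: the sample line mimics the format written by get_paper_url.
sample = 'http://www.cnki.com.cn/Article/XXXX.htm\t示例标题\t《某期刊》2018年\t被引次数(3)\n'
href, title, publish, reference = sample.rstrip('\n').split('\t')
print(href)
print(title, publish, reference)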