zhkmjj
# -*- coding: utf-8 -*-
from configparser import ConfigParser
from urllib.parse import quote
import socket
import os
import math
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import time

import spider_search_page
import spider_paper

if __name__ == '__main__':
    start = time.perf_counter()
    cf = ConfigParser()
    cf.read("Config.conf", encoding='utf-8')
    keyword = cf.get('base', 'keyword')                # search keyword
    maxpage = cf.getint('base', 'maxpage')             # maximum page number
    searchlocation = cf.get('base', 'searchlocation')  # search field
    currentpage = cf.getint('base', 'currentpage')     # resume point

    # starting a fresh crawl: drop the intermediate file from any previous run
    if os.path.exists('data-detail.txt') and currentpage == 0:
        print('Output file exists, deleting it')
        os.remove('data-detail.txt')

    # map the configured search field to CNKI's query prefix;
    # the keys must match the searchlocation value in Config.conf
    values = {
        '全文': 'qw',        # full text
        '主题': 'theme',     # topic
        '篇名': 'title',     # title
        '作者': 'author',    # author
        '摘要': 'abstract'   # abstract
    }
    keywordval = values[searchlocation] + ':' + str(keyword)
    # quote() percent-encodes the Chinese query string for the URL
    index_url = 'http://search.cnki.com.cn/Search.aspx?q=' + quote(keywordval) + '&rank=&cluster=&val=&p='
    print(index_url)

    # read the result count from the first page and derive the page count
    socket.setdefaulttimeout(10)  # 10-second timeout for all requests
    html = urllib.request.urlopen(index_url).read()
    soup = BeautifulSoup(html, 'html.parser')
    pagesum_text = soup.find('span', class_='page-sum').get_text()
    maxpage = math.ceil(int(pagesum_text[7:-1]) / 15)  # 15 results per page

    cf.set('base', 'maxpage', str(maxpage))
    with open('Config.conf', 'w', encoding='utf-8') as conf_file:
        cf.write(conf_file)

    for i in range(currentpage, maxpage):
        page_num = 15
        page_str_num = i * page_num  # CNKI pages by result offset, not page index
        page_url = index_url + str(page_str_num)
        print(page_url)
        attempts = 0
        success = False
        while attempts < 50 and not success:
            try:
                spider_search_page.get_paper_url(page_url)
                success = True
            except (socket.error, urllib.error.URLError):
                attempts += 1
                print('Retry ' + str(attempts) + '!!')
        # remember the last finished page so the crawl can resume
        cf.set('base', 'currentpage', str(i))
        with open('Config.conf', 'w', encoding='utf-8') as conf_file:
            cf.write(conf_file)

    spider_paper.spider_paper()  # fill in the details for every article found
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))
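The main script reads its settings from Config.conf via ConfigParser. A minimal sketch of that file, assuming only the section and key names used above (the keyword and searchlocation values are placeholders; maxpage and currentpage are rewritten by the script as the crawl progresses):

[base]
keyword = 数据挖掘
searchlocation = 全文
maxpage = 0
currentpage = 0

searchlocation must be one of the keys of the values dict above (全文, 主题, 篇名, 作者, 摘要).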
# -*- coding: utf-8 -*-
import socket
import time
import urllib.request
import urllib.error
from configparser import ConfigParser

import xlwt
from bs4 import BeautifulSoup


def spider_paper():
    start = time.perf_counter()
    file = open('data-detail.txt', encoding='utf8')
    cf = ConfigParser()
    cf.read('Config.conf', encoding='utf-8')
    keyword = cf.get('base', 'keyword')  # search keyword, used in the output file name

    # output workbook
    wb = xlwt.Workbook(encoding='utf-8')
    sheet = wb.add_sheet('data-out')
    sheet.write(0, 0, 'Download URL')
    sheet.write(0, 1, 'Title')
    sheet.write(0, 2, 'Source')
    sheet.write(0, 3, 'Citations')
    sheet.write(0, 4, 'Authors')
    sheet.write(0, 5, 'Author affiliation')
    sheet.write(0, 6, 'Keywords')
    sheet.write(0, 7, 'Abstract')
    sheet.write(0, 8, 'Co-cited references')

    lines = file.readlines()
    lin_num = 1
    paper_list = []
    socket.setdefaulttimeout(10)  # 10-second timeout for all requests
    for line in lines:
        paper_url = line.split('\t')[0]
        if paper_url in paper_list:  # skip duplicate papers
            continue
        paper_list.append(paper_url)

        attempts = 0
        success = False
        soup = None
        while attempts < 50 and not success:
            try:
                html = urllib.request.urlopen(paper_url).read()
                soup = BeautifulSoup(html, 'html.parser')
                success = True
            except (socket.error, urllib.error.URLError):
                attempts += 1
                print('Retry ' + str(attempts) + '!!')
        if not success:
            continue  # give up on this paper after 50 failed attempts

        # CNKI marks these blocks only by inline styles, so match on them
        title = soup.find_all('div', style="text-align:center; width:740px; font-size: 28px;color: #0000a0; font-weight:bold; font-family:'宋体';")
        abstract = soup.find_all('div', style='text-align:left;word-break:break-all')
        author = soup.find_all('div', style='text-align:center; width:740px; height:30px;')

        # author names
        for item in author:
            author = item.get_text()

        # abstract: concatenate every string in the block ...
        tmp = ''
        for thing in abstract:
            for string in thing.strings:
                tmp = tmp + string
        # ... then strip whitespace fragments
        tstr = ''
        for t in tmp.split(' '):
            for i in t.split('\n'):
                if len(i) > 1:
                    for j in i.split('\r'):
                        for k in j.split('\t'):
                            tstr += k

        # co-cited references ('【共引文献】' is the section header on the page)
        ifreferen = soup.find_all('td', class_='b14', rowspan='2')
        ref = ''
        for i in range(len(ifreferen)):
            if '【共引文献】' in ifreferen[i].get_text():
                referenceList = soup.find_all('div', id='div_Ref')
                if len(referenceList) == 0:
                    referenceList = soup.find_all('div', class_='div_Ref')
                referenceList = referenceList[i]
                for tdref in referenceList.find_all('td', width='676'):
                    refitem = tdref.a.get('href').strip()
                    print(refitem)
                    ref = ref + refitem + ' ,'

        # author affiliation: scan from the '【作者单位】:' label (or, for theses,
        # '【学位授予单位】:') up to the next '【' label
        authorUnitScope = soup.find('div', style='text-align:left;', class_='xx_font')
        author_unit = ''
        author_unit_text = authorUnitScope.get_text()
        if '【作者单位】:' in author_unit_text:
            auindex = author_unit_text.find('【作者单位】:', 0)
        else:
            auindex = author_unit_text.find('【学位授予单位】:', 0)
        for k in range(auindex, len(author_unit_text)):
            if author_unit_text[k] in ('\n', '\t', '\r', '】'):
                continue
            if author_unit_text[k] == ' ' and author_unit_text[k + 1] == ' ':
                continue
            if author_unit_text[k] != '【':
                author_unit = author_unit + author_unit_text[k]
            if author_unit_text[k] == '【' and k != auindex:
                break

        # keywords: the same scan, starting at the '【关键词】:' label
        key_word = ''
        kwindex = author_unit_text.find('【关键词】:', 0)
        for k in range(kwindex, len(author_unit_text)):
            if author_unit_text[k] in ('\n', '\t', '\r', '】'):
                continue
            if author_unit_text[k] == ' ' and author_unit_text[k + 1] == ' ':
                continue
            if author_unit_text[k] != '【':
                key_word = key_word + author_unit_text[k]
            if author_unit_text[k] == '【' and k != kwindex:
                break

        # append the new columns to the original tab-separated line
        line = line.strip('\n')
        line = line + '\t' + str(author) + '\t' + str(author_unit) + '\t' + str(key_word) \
               + '\t' + str(tstr) + '\t' + str(ref) + '\n'
        outstring = line.split('\t')
        for i in range(len(outstring)):
            sheet.write(lin_num, i, outstring[i])
        print('Wrote row ' + str(lin_num))
        lin_num += 1

    wb.save('data_out_' + str(keyword) + '.xls')
    file.close()
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))
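The 50-attempt retry loop above is duplicated between the main script and spider_paper. A small helper could factor it out; the sketch below is an illustration only, not part of the original scripts (the name fetch_with_retry is hypothetical):

import socket
import urllib.request
import urllib.error

def fetch_with_retry(url, max_attempts=50, timeout=10):
    # Return the page bytes, or None once every attempt has failed.
    for attempt in range(1, max_attempts + 1):
        try:
            return urllib.request.urlopen(url, timeout=timeout).read()
        except (socket.error, urllib.error.URLError):
            print('Retry ' + str(attempt) + '!!')
    return None

With this in place, the download step in spider_paper reduces to html = fetch_with_retry(paper_url) followed by a None check.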
# -*- coding: utf-8 -*-
import io
import sys
import urllib.request

from bs4 import BeautifulSoup

# force UTF-8 console output regardless of the platform default
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

page_num = 15  # results per search page


def get_paper_url(page_url):
    html = urllib.request.urlopen(page_url).read()
    soup = BeautifulSoup(html, 'html.parser')
    f = open('data-detail.txt', 'a+', encoding='utf-8')
    results = soup.find_all('div', class_='wz_content')
    for result in results:
        item = result.find('a', target='_blank')  # article title and link
        href = item.get('href')   # article URL
        title = item.get_text()   # article title
        # the source and the citation count share one span
        year_count = result.find('span', class_='year-count')
        publish = ''
        reference = ''
        for child in year_count:
            text = child.string
            if text is None:  # skip nested tags without a single string
                continue
            text = text.replace('\n', '').replace('\r', '')
            if '被引次数' in text:   # '被引次数' = citation count
                reference = text
            elif '年' in text:       # the source line contains the year ('年')
                publish = text
        f.write(href + '\t' + title + '\t' + publish + '\t' + reference + '\n')
    f.close()
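Together the three modules form a pipeline: the main script walks the CNKI result pages, get_paper_url appends one tab-separated line per article (URL, title, source, citation count) to data-detail.txt, and spider_paper re-fetches each URL to add authors, affiliation, keywords, abstract, and co-cited references before saving data_out_<keyword>.xls. A small sketch for inspecting the intermediate file, assuming it was produced by get_paper_url above:

# Print the title and URL of every article collected so far.
with open('data-detail.txt', encoding='utf-8') as f:
    for line in f:
        url, title, publish, reference = line.rstrip('\n').split('\t')
        print(title + ' -> ' + url)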