Python爬取微软学术保存数据库(代码粗略有待完善~)
来源:互联网 发布:js 把内容动态写入div 编辑:程序博客网 时间:2024/06/05 20:09
# -*- coding:utf-8 -*-
# Mr. Yang Geng's first blog post
import urllib
import urllib2
import json
import ssl
import time
import MySQLdb


class Getinfo():
    """Scrape one author's profile and papers from the Microsoft Academic
    search API and store them in a local MySQL database.

    Targets Python 2 (``urllib2``, ``MySQLdb``).  The constructor downloads
    the first result page; ``get_intro`` stores the author profile,
    ``get_first_page`` stores the papers of that first page and
    ``get_other_page`` walks the following pages.

    Attributes:
        url: URL of the first result page (offset 0).
        headers: HTTP headers sent with every request.
        num: 1-based running number of the next paper to be printed.
        data: decoded JSON body of the first result page.
    """

    # NOTE(review): credentials are hard-coded (host, user, password,
    # database) exactly as in the original script.
    DB_ARGS = ("localhost", "root", "root", "robert tibshirani")

    # The query URL is split around the ``offset`` value so that every page
    # is built from the same template.
    URL_PREFIX = ("https://academic.microsoft.com/api/search/GetEntityResults"
                  "?query=And(Ty=%270%27,Composite(AA.AuId=57053632))"
                  "&filters=Composite(AA.AuId=254766753)&offset=")
    URL_SUFFIX = "&limit=8&correlationId=07bba304-338832b7-638a-a415415d"

    # Must match the ``limit=8`` query parameter in URL_SUFFIX.
    PAGE_SIZE = 8

    def __init__(self):
        """Download and decode the first result page into ``self.data``."""
        # NOTE(review): this disables TLS certificate verification for the
        # whole process.  Kept because the original script relies on it.
        ssl._create_default_https_context = ssl._create_unverified_context
        self.url = self._page_url(0)
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"}
        self.num = 1
        req = urllib2.Request(self.url, headers=self.headers)
        self.data = self._fetch(req)

    def _page_url(self, offset):
        """Return the search URL for the page starting at result ``offset``."""
        return self.URL_PREFIX + '%d' % offset + self.URL_SUFFIX

    def _fetch(self, req):
        """Open ``req`` (20 s timeout) and return the decoded JSON body."""
        html = urllib2.urlopen(req, timeout=20)
        try:
            return json.load(html)
        finally:
            # The original never closed the response object.
            html.close()

    def _execute(self, sql, params=None):
        """Run one statement on a fresh connection, commit and clean up.

        ``params`` are passed to the driver separately so that it performs
        the quoting; scraped text is never interpolated into the SQL string.
        Raises whatever the driver raises; callers decide how to report it.
        """
        # utf8 so that non-Latin-1 names and abstracts can be transmitted.
        db = MySQLdb.connect(*self.DB_ARGS, charset="utf8")
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql, params)
                db.commit()
            finally:
                cursor.close()
        finally:
            db.close()

    def _handle_paper(self, result):
        """Print one entry of the ``results`` list and insert it into the
        ``papers`` table.

        The ``papers`` table is not created anywhere in this script and is
        assumed to exist already.  Database failures are reported and
        swallowed so that scraping continues (best effort).
        """
        paper_id = result['id']
        cited = result['cc']
        published = result['d'].encode('gbk')
        number = self.num

        # NOTE: the original called eval() on this remotely supplied string,
        # which executes arbitrary code from the network (and TLS
        # verification is disabled).  The field is parsed as JSON instead.
        try:
            extra = json.loads(result['e'])
        except (KeyError, TypeError, ValueError):
            extra = {}
        if not isinstance(extra, dict):
            extra = {}

        # Missing fields are stored as empty strings.  The original left the
        # variable unbound (or stale from the previous paper) in that case.
        title = extra.get('DN')
        if title is None:
            print('No title')
            title = ''
        else:
            print('paper_%d_title: %s' % (number, title))
        print('文章ID: %d' % paper_id)
        print('发表时间: %s' % published)
        print('引用量: %d' % cited)
        content = extra.get('D')
        if content is None:
            print('No content')
            content = ''
        else:
            print('paper_%d_content: %s' % (number, content))
        self.num += 1

        try:
            self._execute(
                "INSERT INTO papers(TITLE,PAID,PUBLIC,CITED,CONTENT)"
                "VALUES (%s,%s,%s,%s,%s)",
                (title, paper_id, published, cited, content))
        except (MySQLdb.Error, UnicodeError) as exc:
            # Report the number of the paper that failed, not the next one.
            print('papers_%s 此处存入失败' % number)
            print('  reason: %s' % exc)
        print('--------------------------------------')

    def get_intro(self):
        """Print the author profile from the first page and store it in the
        ``intro`` table, creating the table if necessary."""
        entity = self.data['entitiesInQuery'][0]
        author = entity['entityTitle']
        auID = entity['entity']['id']
        univ = entity['lastKnownAffiliation']['lt']
        photo = entity['image']
        intro = entity['description']
        print('name: ' + author)
        print('authorID: %d' % auID)
        print('college: ' + univ)
        print('photo: ' + photo)
        print('introduction: ' + intro)
        try:
            # The original declared PHOTO/INTRODUCTION as MESSAGE_TEXT, which
            # is not a MySQL column type, so the table was never created and
            # the bare ``except: pass`` hid the error.
            self._execute("""CREATE TABLE IF NOT EXISTS intro (
                     NAME VARCHAR(100),
                     COLLEGE VARCHAR(100),
                     AUID INT PRIMARY KEY,
                     PHOTO TEXT,
                     INTRODUCTION TEXT )""")
        except MySQLdb.Error as exc:
            print('could not create table intro: %s' % exc)
        try:
            self._execute(
                "INSERT INTO intro(NAME,COLLEGE,AUID,PHOTO,INTRODUCTION)"
                "VALUES (%s,%s,%s,%s,%s)",
                (author, univ, auID, photo, intro))
        except (MySQLdb.Error, UnicodeError) as exc:
            print('could not store author intro: %s' % exc)
        print('--------------------------------------')

    def get_first_page(self):
        """Print and store every paper of the page fetched by ``__init__``.

        The original looped over ``range(0, 7)`` and therefore skipped the
        eighth result of a ``limit=8`` page.
        """
        for result in self.data.get('results') or []:
            self._handle_paper(result)

    def get_single_page(self, i, req):
        """Fetch the page described by ``req`` and print/store its ``i``-th
        result.

        Raises IndexError if the page has no result at index ``i`` and
        propagates network or JSON decoding errors to the caller.
        """
        data = self._fetch(req)
        self._handle_paper(data['results'][i])

    def get_other_page(self):
        """Print and store the papers of result pages 1 to 14.

        Each page is downloaded once (the original re-downloaded it for
        every single result).  A failed download is retried once after a
        five second pause; an empty page ends the walk.
        """
        for page in range(1, 15):
            url = self._page_url(page * self.PAGE_SIZE)
            req = urllib2.Request(url, headers=self.headers)
            try:
                data = self._fetch(req)
            except Exception as exc:
                print('fetch failed (%s), retrying in 5 seconds' % exc)
                time.sleep(5)
                data = self._fetch(req)
            results = data.get('results') or []
            if not results:
                break
            for result in results:
                self._handle_paper(result)


if __name__ == '__main__':
    get = Getinfo()
    get.get_intro()
    get.get_first_page()
    get.get_other_page()
阅读全文
1 0
- Python爬取微软学术保存数据库(代码粗略有待完善~)
- python爬取返利网(完善)
- javascript 取form元素、form元素中的input元素值(有待完善)
- javascript扫雷游戏示例(有待完善)
- 服务器配置(有待发现并完善)
- 常用命令(有待完善好的命令)
- 骗子电话大全(有待完善)
- ffmpeg 收集 (刚开始,有待完善)
- 车头车尾识别(有待完善)
- #python学习笔记#使用python爬取网站数据并保存到数据库
- J2ME--显示手机通讯录记录(有待完善)
- Delete Node in a Linked List(有待完善)
- 电子技术书籍(日本)——有待补充完善
- python尝试从通联数据爬取信息并保存在mongodb数据库中
- Python爬虫-利用百度地图API接口爬取数据并保存至MySQL数据库
- 有待完善思路的题解
- python爬取视频代码
- 数据保存!!!Python 爬取网页数据后,三种保存格式---保存为txt文件、CSV文件和mysql数据库
- maven-resources-plugin 一般配置在 build>plugins
- [伪·ZOJ2314] 无源汇有上下界的最大流
- PHP学习笔记——新特性匿名函数
- MooFest
- PowerDesigner15/16中 Inheritance、Association、Association Link无法使用的解决方法
- Python爬取微软学术保存数据库(代码粗略有待完善~)
- 网络编程--字节序
- 剑指offer 数组中只出现一次的数字
- Redis 缓存 + Spring 的集成示例
- 创建表和添加索引
- ORACLE RAC 11.2.0.4 一节点出现Suspending MMON slave action kewrmrfsa_ for 82800 seconds
- ubuntu14.04 输入用户账号密码,无法进入桌面
- shell 脚本之 cut awk sed 命令详解
- 把字符串转换成整数(Java实现)