Scraping in Practice: Fetching SNP Information from the NCBI Website
What is a crawler? Some kind of insect?
1. What a Crawler Is
A web crawler is a program or script that fetches web page content according to a set of rules. Crawlers can be written in just about any programming language, for example Java, Python, or Perl.
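At its core a crawler does nothing more than "download a page, then extract what you need". A minimal sketch in Python (the URL is only a placeholder for illustration):

import urllib.request

# Fetch a page while sending a browser-like User-Agent,
# then print the first few hundred characters of its HTML.
url = 'http://example.com/'
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
with urllib.request.urlopen(req) as response:
    html = response.read().decode('utf-8')
print(html[:300])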
2. Tools
PyCharm
Python 3.6
3. Walking Through the Code
Write code to a consistent standard: always add comments, and start every script with a header like this one.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# created: 2017-08-20
__author__ = '孙成全'
__doc__ = 'The script is used to xxx'
1. If a program hits the same website over and over, the load on the server grows, the site may flag the traffic as non-human, and your IP gets blocked, after which every fetch fails. This is where proxies come in.
I found a free and reasonably usable proxy-list site:
http://lab.crossincode.com/proxy/
We fetch the IP addresses listed on that page with a small program.
First, study the page source.
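The proxy list is rendered as an HTML table, one <tr> per proxy, with the address and port in the first two <td> cells. A quick way to check the regex before pointing it at the live page is to run it against a hand-written sample row. The sample HTML below is an assumption about the page structure, not a capture of it:

import re

# Hypothetical sample mimicking one row of the proxy table.
sample = '''<tr>
    <td>1.2.3.4</td>
    <td>8080</td>
    <td>HTTP</td>
    <td>high</td>
</tr>'''

m = re.search(r'<tr>\n\s*?<td>(.*?)</td>\n\s*?<td>(.*?)</td>', sample, re.S)
print(m.group(1), m.group(2))  # expected output: 1.2.3.4 8080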
import urllib.request
import re

class Get_ip():
    '''Fetch proxy IP addresses.'''

    def url_open(self, url):
        '''Open the URL while pretending to be a browser.'''
        req = urllib.request.Request(url)
        # Spoof a browser User-Agent so the site serves the normal page.
        req.add_header('User-Agent',
                       'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/60.0.3112.101 Safari/537.36')
        response = urllib.request.urlopen(req)  # must pass req, or the header is ignored
        html = response.read().decode('utf-8')
        return html

    def geteveryclass(self, source):
        '''Split the page into one string per table row.'''
        everyclass = re.findall('(<tr.*?</tr>)', source, re.S)
        return everyclass

    def get_ip(self):
        '''Return up to ten addr:port strings scraped from the page.'''
        url = 'http://lab.crossincode.com/proxy/'
        try:
            source = self.url_open(url)
        except Exception:
            print('something wrong')
            return []
        everyclass = self.geteveryclass(source)[1:]   # drop the table header row
        ip_list = []
        for eachclass in everyclass:
            row = re.search(r'<tr>\n\s*?<td>(.*?)</td>\n\s*?<td>(.*?)</td>',
                            eachclass, re.S)
            a = row.group(1)   # Addr
            b = row.group(2)   # Port
            ip_list.append(a + ':' + b)
        return ip_list[:10]   # first 10 proxy addresses
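With the class in place, turning its output into a proxies dict for requests takes one random pick. A small usage sketch (it assumes the Get_ip class above is importable; the test URL is arbitrary):

import random
import requests

ip_list = Get_ip().get_ip()               # e.g. ['1.2.3.4:8080', ...]
if ip_list:
    proxies = {'http': random.choice(ip_list)}
    # Route one request through the proxy; free proxies fail often,
    # so a short timeout keeps a dead one from hanging the script.
    r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
    print(r.text)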
2. Fetch the SNP information from NCBI: read the SNP rs numbers from an Excel sheet, look each one up, and write what is found back into a new spreadsheet. The complete program:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# created: 2017-09-20
__author__ = '孙成全'
__doc__ = 'The script is used to ...'

import urllib.request
import requests
import re
import xlrd, xlsxwriter
import random
import time


def Mysign():
    '''Author signature.'''
    print("------The author's Sunchengquan------")
    print('--------' + time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(time.time())) + '--------')

Mysign()


class Get_ip():
    '''Fetch proxy IP addresses.'''

    def url_open(self, url):
        '''Open the URL while pretending to be a browser.'''
        req = urllib.request.Request(url)
        req.add_header('User-Agent',
                       'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/60.0.3112.101 Safari/537.36')
        response = urllib.request.urlopen(req)
        html = response.read().decode('utf-8')
        return html

    def geteveryclass(self, source):
        everyclass = re.findall('(<tr.*?</tr>)', source, re.S)
        return everyclass

    def get_ip(self):
        '''Return up to ten addr:port strings scraped from the page.'''
        url = 'http://lab.crossincode.com/proxy/'
        try:
            source = self.url_open(url)
        except Exception:
            print('something wrong')
            return []
        everyclass = self.geteveryclass(source)[1:]   # drop the table header row
        ip_list = []
        for eachclass in everyclass:
            row = re.search(r'<tr>\n\s*?<td>(.*?)</td>\n\s*?<td>(.*?)</td>'
                            r'\n\s*?<td>(.*?)</td>\n\s*?<td>(.*?)</td>',
                            eachclass, re.S)
            a = row.group(1)   # Addr
            b = row.group(2)   # Port
            ip_list.append(a + ':' + b)
        return ip_list[:10]


class NCBI():

    def url_open(self, url, proxies=None, num_retries=6):
        '''Fetch a page, switching to proxies once a direct request fails.'''
        agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
        ua = random.choice(agent_list)        # pick a random User-Agent string
        header = {"User-Agent": ua}           # build the request header
        if proxies is None:
            try:
                html = requests.get(url, proxies=proxies, headers=header)
                return html.text
            except:
                time.sleep(3)
                print('failed to fetch the page directly')
                print('switching to a proxy')
                ip_list = Get_ip().get_ip()
                IP = random.choice(ip_list)
                proxies = {'http': IP}
                return self.url_open(url, proxies)
        else:
            try:
                ip_list = Get_ip().get_ip()
                IP = random.choice(ip_list)
                proxies = {'http': IP}
                html = requests.get(url, proxies=proxies, headers=header)
                return html.text
            except:
                if num_retries > 0:
                    time.sleep(3)
                    print('switching proxy, retrying in 3 s, %d retries left' % num_retries)
                    print('current proxy:', proxies)
                    return self.url_open(url, proxies, num_retries - 1)
                else:
                    print('all proxies failed, falling back to a direct request')
                    return self.url_open(url)

    def excel_open(self, name):
        '''Read the rs numbers from column 0 of the sheet named SNP.'''
        fname = name
        bk = xlrd.open_workbook(fname)
        try:
            sh = bk.sheet_by_name("SNP")
        except:
            print("no sheet in %s named SNP" % fname)
            raise
        nrows = sh.nrows
        row_list = []
        for i in range(0, nrows):
            row_data = sh.cell_value(i, 0)
            row_list.append(row_data)
        return row_list

    def get_url(self, name):
        '''Build one NCBI dbSNP query URL per rs number.'''
        snp_list = self.excel_open(name)
        url_list = []
        for i in snp_list:
            url = 'https://www.ncbi.nlm.nih.gov/snp/?term=' + i
            url_list.append(url)
        return url_list

    def main(self):
        '''Scrape every SNP page and write the results to Excel.'''
        url1 = self.get_url('SNP.xlsx')
        SNP_list = self.excel_open('SNP.xlsx')
        a = 0
        b = 0
        workbook = xlsxwriter.Workbook('wegene.xlsx')
        worksheet = workbook.add_worksheet('SNP')
        try:
            for i in url1:
                try:
                    source = self.url_open(i)
                    time.sleep(2)   # be polite between requests
                    globalmaf = re.findall('Global MAF:</a></dt><dd>(.*?)\n</dd>',
                                           source, re.S)
                    worksheet.write(b, 3, globalmaf[0] if globalmaf else 'None')
                    chromosome = re.findall('<dt>Chromosome: </dt><dd>(.*?)<br',
                                            source, re.S)
                    worksheet.write(b, 1, chromosome[0] if chromosome else 'None')
                    # The flanking sequence is split across three HTML fragments:
                    # the left flank, the highlighted allele, and the right flank.
                    sequence1 = re.findall('class="snp_flanks">(.*?)<span', source, re.S)
                    sequence2 = re.findall('style="color:red">(.*?)</span>', source, re.S)
                    try:
                        sequence3 = re.search('style="color:red">(.*?)</span>(.*?) </pre>',
                                              source, re.S).group(2)
                        sequence = sequence1[0] + sequence2[0] + sequence3
                        worksheet.write(b, 2, sequence)
                    except:
                        worksheet.write(b, 2, 'None')
                except:
                    print('something wrong')
                finally:
                    a = a + 1
                    print('fetching record %d: %s' % (a, i))
                    worksheet.write(b, 0, SNP_list[b] if SNP_list[b] else 'None')
                    b += 1
        finally:
            workbook.close()


if __name__ == '__main__':
    NCBI().main()
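The script expects an input workbook SNP.xlsx containing a sheet named SNP whose first column holds one rs number per row. A throwaway input file for testing can be generated like this (the two rs ids are examples only):

import xlsxwriter

workbook = xlsxwriter.Workbook('SNP.xlsx')
worksheet = workbook.add_worksheet('SNP')    # sheet name must be "SNP"
for row, rsid in enumerate(['rs1801133', 'rs429358']):
    worksheet.write(row, 0, rsid)            # rs numbers go in column 0
workbook.close()

Running the main script then produces wegene.xlsx, with the rs number, chromosome, flanking sequence, and Global MAF in columns 0 through 3.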