Web Scraping in Practice: Fetching SNP Data from NCBI


So what is a crawler, anyway? Some kind of insect?

1 Introduction to Web Crawlers

A web crawler is a program or script that retrieves information from web pages according to a set of rules. You can write a crawler in just about any programming language, such as Java, Python, or Perl.
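To make that concrete, here is about the smallest crawler you can write; a sketch using only Python's standard library (the URL is just an example):

```python
# Minimal crawler: download one page and report how much text came back.
import urllib.request

url = 'https://www.ncbi.nlm.nih.gov/'  # example target
with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')
print(len(html), 'characters fetched')
```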

2 Main Tools


PyCharm

Python 3.6

3 Code Walkthrough

Keep your code disciplined: always write comments, and start each file with a header block like this:

```python
#!/usr/bin/python
# -*- coding: utf8 -*-
# created: 2017-08-20
__author__ = '孙成全'
__doc__ = 'The script is used to xxx'
```

1. If a program hits a website over and over, the server load becomes suspicious: the site may decide the traffic is not human, block your IP, and your page fetches will start failing. That is when you need a proxy.
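For context, "using a proxy" with the requests library is just one extra argument; a minimal sketch (the proxy address here is a made-up placeholder):

```python
# Route a single request through an HTTP proxy.
# '123.45.67.89:8080' is a placeholder; substitute a live proxy address.
import requests

proxies = {'http': '123.45.67.89:8080'}
response = requests.get('http://example.com/', proxies=proxies, timeout=10)
print(response.status_code)
```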

I found a free proxy-list site online that works reasonably well:

http://lab.crossincode.com/proxy/

We can write a small program to scrape the IP addresses off this page.

First, analyze the page's HTML source.

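Each proxy lives in a `<tr>` table row whose first two cells hold the address and the port; that structure is what the regular expressions below rely on. A small self-contained sketch (the sample row is a reconstruction inferred from those regexes, not a verbatim copy of the page):

```python
# Demonstrates how the row regex picks Addr and Port out of one table row.
# The sample HTML is a reconstruction inferred from the regex, not the live page.
import re

row = '<tr>\n    <td>123.45.67.89</td>\n    <td>8080</td>\n    <td>HTTP</td>\n    <td>CN</td>\n</tr>'
m = re.search(r'<tr>\n\s*?<td>(.*?)</td>\n\s*?<td>(.*?)</td>', row, re.S)
print(m.group(1) + ':' + m.group(2))  # -> 123.45.67.89:8080
```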

```python
import re
import urllib.request


class Get_ip():
    'Fetch a list of proxy IP addresses'

    def url_open(self, url):
        'Open the connection and set up the request'
        req = urllib.request.Request(url)
        # Spoof the User-Agent so the site treats us like a browser
        req.add_header('User-Agent',
                       'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36')
        response = urllib.request.urlopen(req)  # pass req, not url, so the header is actually sent
        html = response.read().decode('utf-8')
        return html

    def geteveryclass(self, source):
        'Split the page into <tr>...</tr> table rows'
        everyclass = re.findall(r'(<tr.*?</tr>)', source, re.S)
        return everyclass

    def get_ip(self):
        'Return up to ten addr:port strings scraped from the proxy page'
        url = 'http://lab.crossincode.com/proxy/'
        try:
            source = self.url_open(url)
        except Exception:
            print('something went wrong')
            return []
        everyclass = self.geteveryclass(source)[1:]  # drop the table header row
        ip_list = []
        for eachclass in everyclass:
            a = re.search(r'<tr>\n\s*?<td>(.*?)</td>\n\s*?<td>(.*?)</td>',
                          eachclass, re.S).group(1)  # Addr
            b = re.search(r'<tr>\n\s*?<td>(.*?)</td>\n\s*?<td>(.*?)</td>',
                          eachclass, re.S).group(2)  # Port
            ip_list.append(a + ':' + b)
        return ip_list[:10]  # first 10 proxies
```
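To sanity-check the class before wiring it into anything bigger, run it on its own. A minimal usage sketch, assuming the proxy page is reachable and still uses the same table layout:

```python
# Print the first batch of scraped proxies.
if __name__ == '__main__':
    for proxy in Get_ip().get_ip():
        print(proxy)  # e.g. '123.45.67.89:8080'
```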

2. Now for the NCBI part: read the SNP rs IDs from an Excel sheet, query NCBI for each one, and write what comes back (chromosome, flanking sequence, and global MAF) into a new spreadsheet. Each lookup is a plain query-string URL of the form https://www.ncbi.nlm.nih.gov/snp/?term=<rs ID>.

```python
#!/usr/bin/python
# -*- coding: utf8 -*-
# created: 2017-09-20
__author__ = '孙成全'
__doc__ = 'The script is used to fetch SNP information from NCBI'

import random
import re
import time
import urllib.request

import requests
import xlrd
import xlsxwriter


def Mysign():
    'Author signature'
    print("------The author's Sunchengquan------")
    print('--------' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + '--------')


Mysign()


class Get_ip():
    'Fetch proxy IP addresses (same scraper as above, with one regex covering all four columns)'

    def url_open(self, url):
        'Open the connection and set up the request'
        req = urllib.request.Request(url)
        req.add_header('User-Agent',
                       'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36')
        response = urllib.request.urlopen(req)  # pass req so the header is actually sent
        return response.read().decode('utf-8')

    def geteveryclass(self, source):
        'Split the page into <tr>...</tr> table rows'
        return re.findall(r'(<tr.*?</tr>)', source, re.S)

    def get_ip(self):
        'Return up to ten addr:port strings'
        url = 'http://lab.crossincode.com/proxy/'
        try:
            source = self.url_open(url)
        except Exception:
            print('something went wrong')
            return []
        everyclass = self.geteveryclass(source)[1:]  # drop the table header row
        row_re = (r'<tr>\n\s*?<td>(.*?)</td>\n\s*?<td>(.*?)</td>'
                  r'\n\s*?<td>(.*?)</td>\n\s*?<td>(.*?)</td>')
        ip_list = []
        for eachclass in everyclass:
            m = re.search(row_re, eachclass, re.S)
            ip_list.append(m.group(1) + ':' + m.group(2))  # Addr:Port
        return ip_list[:10]


class NCBI():

    def url_open(self, url, proxies=None, num_retries=6):
        'Fetch a page directly; on failure, retry through random proxies'
        agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
        ua = random.choice(agent_list)  # pick a random User-Agent string
        header = {"User-Agent": ua}    # build a complete User-Agent header
        if proxies is None:
            try:
                html = requests.get(url, proxies=proxies, headers=header)
                return html.text
            except:
                time.sleep(3)
                print('failed to fetch the page')
                print('switching to a proxy')
                ip_list = Get_ip().get_ip()
                IP = random.choice(ip_list)
                proxies = {'http': IP}
                return self.url_open(url, proxies)
        else:
            try:
                ip_list = Get_ip().get_ip()
                IP = random.choice(ip_list)
                proxies = {'http': IP}
                html = requests.get(url, proxies=proxies, headers=header)
                return html.text
            except:
                if num_retries > 0:
                    time.sleep(3)
                    print('switching proxy; retrying in 3 s (%d retries left)' % num_retries)
                    print('current proxy:', proxies)
                    return self.url_open(url, proxies, num_retries - 1)
                else:
                    print('all proxies failed; falling back to a direct connection')
                    return self.url_open(url)

    def excel_open(self, name):
        'Read the rs IDs from column 0 of the sheet named "SNP"'
        bk = xlrd.open_workbook(name)
        try:
            sh = bk.sheet_by_name("SNP")
        except Exception:
            print("no sheet in %s named SNP" % name)
            raise
        row_list = []
        for i in range(sh.nrows):
            row_list.append(sh.cell_value(i, 0))
        return row_list

    def get_url(self, name):
        'Build one NCBI query URL per rs ID'
        snp_list = self.excel_open(name)
        url_list = []
        for i in snp_list:
            url_list.append('https://www.ncbi.nlm.nih.gov/snp/?term=' + i)
        return url_list

    def main(self):
        'Scrape every SNP page and write the results to wegene.xlsx'
        url1 = self.get_url('SNP.xlsx')
        SNP_list = self.excel_open('SNP.xlsx')
        a = 0  # progress counter
        b = 0  # output row index
        workbook = xlsxwriter.Workbook('wegene.xlsx')
        worksheet = workbook.add_worksheet('SNP')
        try:
            for i in url1:
                try:
                    source = self.url_open(i)
                    time.sleep(2)  # be polite between requests
                    # column 3: global minor allele frequency
                    globalmaf = re.findall(r'Global MAF:</a></dt><dd>(.*?)\n</dd>', source, re.S)
                    worksheet.write(b, 3, globalmaf[0] if globalmaf else 'None')
                    # column 1: chromosome
                    chromosome = re.findall(r'<dt>Chromosome: </dt><dd>(.*?)<br', source, re.S)
                    worksheet.write(b, 1, chromosome[0] if chromosome else 'None')
                    # column 2: flanking sequence, stitched together from three page fragments
                    sequence1 = re.findall(r'class="snp_flanks">(.*?)<span', source, re.S)
                    sequence2 = re.findall(r'style="color:red">(.*?)</span>', source, re.S)
                    try:
                        sequence3 = re.search(r'style="color:red">(.*?)</span>(.*?)&#13;</pre>',
                                              source, re.S).group(2)
                        worksheet.write(b, 2, sequence1[0] + sequence2[0] + sequence3)
                    except:
                        worksheet.write(b, 2, 'None')
                except:
                    print('something went wrong')
                finally:
                    a = a + 1
                    print('fetching record', a, ':', i)
                    # column 0: the rs ID itself
                    worksheet.write(b, 0, SNP_list[b] if SNP_list[b] else 'None')
                    b += 1
        finally:
            workbook.close()  # always flush the workbook, even if something crashed


if __name__ == '__main__':
    NCBI().main()
```
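The script expects an input workbook named SNP.xlsx containing a sheet called SNP with one rs ID per row in the first column. A minimal sketch for generating such a file (the rs IDs are arbitrary examples):

```python
# Create a sample SNP.xlsx in the layout NCBI.excel_open() expects:
# a sheet named 'SNP', one rs ID per row in column 0.
import xlsxwriter

workbook = xlsxwriter.Workbook('SNP.xlsx')
worksheet = workbook.add_worksheet('SNP')
for row, rsid in enumerate(['rs53576', 'rs1815739', 'rs4680']):  # example rs IDs
    worksheet.write(row, 0, rsid)
workbook.close()
```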