Python 最简单的邮箱地址爬取示例

来源:互联网 发布:黎明杀机mac能不能玩 编辑:程序博客网 时间:2024/05/16 19:17
http://www.jb51.net/article/57161.htm
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Extract URLs, e-mail addresses, phone numbers and IPv4 addresses from a file.

Usage: python <script> <path-to-file>

Each extractor prints the list of matches it found and also returns it.
"""
import io
import re
import sys

# Dotted-quad IPv4 address; every octet is restricted to 0-255.
_IP_RE = re.compile(
    r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}'
    r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b',
    re.IGNORECASE)

# A "1" followed by ten digits.  The pattern is not anchored, so it also
# matches inside longer runs of digits.
_PHONE_RE = re.compile(r'1\d{10}', re.IGNORECASE)

# E-mail address.  NOTE: the top-level domain is limited to 2-4 letters, so
# addresses under longer TLDs are not matched.
_MAIL_RE = re.compile(
    r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\b",
    re.IGNORECASE)

# http:// or https:// followed by URL characters or %XX escapes.
_URL_RE = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|"
    r"(?:%[0-9a-fA-F][0-9a-fA-F]))+",
    re.IGNORECASE)


def getIPAddFromFile(fobj):
    """Print and return every IPv4 address found in the text ``fobj``.

    Despite its name, ``fobj`` is the already-read file content (a string),
    not a file object.
    """
    ipadds = _IP_RE.findall(fobj)
    print(ipadds)
    return ipadds


def getPhoneNumFromFile(fobj):
    """Print and return every 11-digit number starting with 1 in ``fobj``."""
    phonenums = _PHONE_RE.findall(fobj)
    print(phonenums)
    return phonenums


def getMailAddFromFile(fobj):
    """Print and return every e-mail address found in the text ``fobj``."""
    mails = _MAIL_RE.findall(fobj)
    print(mails)
    return mails


def getUrlFromFile(fobj):
    """Print and return every http/https URL found in the text ``fobj``."""
    urls = _URL_RE.findall(fobj)
    print(urls)
    return urls


def main(FilefilePath):
    """Read the file at ``FilefilePath`` and run all four extractors on it.

    Returns a tuple ``(urls, mails, phone_numbers, ip_addresses)``; each
    extractor also prints its own result.
    """
    # Read as text so the str patterns apply on Python 3 as well; undecodable
    # bytes are replaced rather than aborting the scan.  The context manager
    # guarantees the file is closed.
    with io.open(FilefilePath, encoding='utf-8', errors='replace') as handle:
        fobj = handle.read()
    urllist = getUrlFromFile(fobj)
    mailList = getMailAddFromFile(fobj)
    phoneNum = getPhoneNumFromFile(fobj)
    ipaddlist = getIPAddFromFile(fobj)
    return urllist, mailList, phoneNum, ipaddlist


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: %s <path-to-file>' % sys.argv[0])
    main(sys.argv[1])
</pre><pre name="code" class="python">
</pre><pre name="code" class="python">
# -*- coding: utf-8 -*-
"""Fetch a web page and print the e-mail addresses found in it."""
import contextlib
import re
import urllib

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

# E-mail address.  NOTE: the top-level domain is limited to 2-4 letters, so
# addresses under longer TLDs are not matched.
_MAIL_RE = re.compile(
    r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\b",
    re.IGNORECASE)


def getHtml(url):
    """Download ``url`` and return the raw response body."""
    # closing() releases the connection on both Python 2 and Python 3.
    with contextlib.closing(urlopen(url)) as page:
        return page.read()


def getImg(html):
    """Print and return every e-mail address found in ``html``.

    The name is historical: the function extracts e-mail addresses, not
    images.  ``html`` may be text or, on Python 3, the ``bytes`` body returned
    by ``getHtml``.
    """
    # On Python 3 a bytes body cannot be searched with a str pattern, so
    # decode it first.  The matched characters are all ASCII, so replacing
    # undecodable bytes cannot lose an address.  On Python 2 ``bytes is str``
    # and the input is searched as-is.
    if bytes is not str and isinstance(html, bytes):
        html = html.decode('utf-8', 'replace')
    imglist = _MAIL_RE.findall(html)
    print(imglist)
    return imglist


if __name__ == '__main__':
    # Guarded so importing this module does not trigger a network request.
    html = getHtml("http://tieba.baidu.com/p/3827945043")
    # getImg already prints its result; the return value is printed again to
    # keep the script's original output.
    print(getImg(html))

0 0
原创粉丝点击