网页爬虫--从网页中获取邮箱地址

来源:互联网 发布:5g网络基础硬件建设 编辑:程序博客网 时间:2024/05/17 01:09
/*
 * Web crawler (spider): extracts e-mail addresses from a given document or
 * web page using the regex "find" facility (Pattern / Matcher).
 */
import java.net.*;
import java.io.*;
import java.util.regex.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * Scans a text source (a web page or a local file) line by line and writes
 * every e-mail address it finds to {@code mail.txt} in the working directory.
 */
class WebCrawler {

    /** E-mail matcher: word characters, '@', word characters, then one or more ".letters" groups. */
    private static final Pattern EMAIL = Pattern.compile("\\w+@\\w+(\\.[a-zA-Z]+)+");

    /** Pulls the charset parameter out of an HTTP Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /** Page scanned by {@link #getMail_02()}. */
    private static final String PAGE_URL = "http://tieba.baidu.com/p/1664627884";

    /** Local file scanned by {@link #getMail_01()}. */
    private static final String INPUT_FILE = "unknown.txt";

    /** File that receives the extracted addresses (overwritten on every run). */
    private static final String OUTPUT_FILE = "mail.txt";

    /** Connect/read timeout so a stalled server cannot hang the program forever. */
    private static final int TIMEOUT_MILLIS = 15000;

    /**
     * Program entry point; runs the web-page extraction.
     * Call {@link #getMail_01()} here instead to scan the local file.
     *
     * @param args ignored
     * @throws Exception if the page cannot be read or the output file cannot be written
     */
    public static void main(String[] args) throws Exception {
        getMail_02();
    }

    /**
     * Downloads the configured web page and writes every e-mail address found
     * in it to {@code mail.txt}, each address followed by a semicolon.
     *
     * @throws Exception if the connection fails or the output file cannot be written
     */
    public static void getMail_02() throws Exception {
        URLConnection conn = new URL(PAGE_URL).openConnection();
        conn.setConnectTimeout(TIMEOUT_MILLIS);
        conn.setReadTimeout(TIMEOUT_MILLIS);

        // try-with-resources closes the network stream and the output file
        // even when reading or writing fails part-way through.
        try (InputStream body = conn.getInputStream();
             Reader page = new InputStreamReader(body, charsetOf(conn.getContentType()));
             Writer out = openOutput()) {
            writeEmails(page, out, ";");
        }
        System.out.println(".....提取完毕");
    }

    /**
     * Reads the local file {@code unknown.txt} (decoded as UTF-8) and writes
     * every e-mail address found in it to {@code mail.txt}, one per line.
     *
     * @throws Exception if the input file cannot be read or the output file cannot be written
     */
    public static void getMail_01() throws Exception {
        try (Reader file = new InputStreamReader(new FileInputStream(INPUT_FILE), StandardCharsets.UTF_8);
             Writer out = openOutput()) {
            writeEmails(file, out, System.lineSeparator());
        }
        System.out.println(".....提取完毕");
    }

    /**
     * Returns every e-mail address found in {@code text}, in order of appearance.
     *
     * @param text text to scan; {@code null} is treated as empty
     * @return the matched addresses (possibly empty, never {@code null})
     */
    static List<String> extractEmails(String text) {
        List<String> found = new ArrayList<String>();
        if (text == null) {
            return found;
        }
        Matcher m = EMAIL.matcher(text);
        while (m.find()) {
            found.add(m.group());
        }
        return found;
    }

    /**
     * Copies every e-mail address found in {@code source} to {@code sink},
     * appending {@code terminator} after each address. Neither stream is closed.
     *
     * @param source     text to scan, read line by line
     * @param sink       destination for the addresses
     * @param terminator text written after each address (e.g. ";" or a line separator)
     * @throws IOException if reading or writing fails
     */
    static void writeEmails(Reader source, Writer sink, String terminator) throws IOException {
        BufferedReader lines = new BufferedReader(source);
        String line;
        while ((line = lines.readLine()) != null) {
            for (String email : extractEmails(line)) {
                sink.write(email);
                sink.write(terminator);
            }
        }
        sink.flush();
    }

    /**
     * Picks the charset named in an HTTP Content-Type header value.
     *
     * @param contentType header value such as {@code "text/html; charset=GBK"}; may be {@code null}
     * @return the named charset, or UTF-8 when none is given or the name is not supported
     */
    static Charset charsetOf(String contentType) {
        if (contentType != null) {
            Matcher m = CHARSET_PARAM.matcher(contentType);
            if (m.find()) {
                try {
                    return Charset.forName(m.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Unknown or malformed charset name: fall back to the default below.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /** Opens the output file for writing, truncating any previous content. */
    private static Writer openOutput() throws IOException {
        return new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(OUTPUT_FILE), StandardCharsets.UTF_8));
    }
}

原创粉丝点击