正则表达式(网页爬虫)

来源:互联网 发布:电子贺卡软件 编辑:程序博客网 时间:2024/05/22 13:13
package com.regex;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Web crawler demo: a small program that retrieves data matching a given rule
 * from a web page (or from a local copy of one).
 *
 * <p>Feature: scrape e-mail addresses.
 */
public class Demo1 {

    /**
     * Rule used to recognise an e-mail address: one or more word characters,
     * an {@code @}, one or more word characters, followed by one or more
     * {@code .word} groups. Compiled once and shared, since a compiled
     * {@link Pattern} is immutable and both readers use the same rule.
     */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Fetches the addresses from the web page and prints each one on its own
     * line (followed by a single trailing space).
     *
     * @param args command-line arguments; not used
     * @throws IOException if the page cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        List<String> list = getMailsByWeb();
        for (String str : list) {
            System.out.println(str + " ");
        }
    }

    /**
     * Downloads {@code http://127.0.0.1:8088/Test/index.html} and collects
     * every e-mail address found in it.
     *
     * <p>No explicit charset is given to the reader, so the JVM's default
     * charset is used to decode the page.
     *
     * @return the matched addresses in order of appearance (duplicates kept);
     *         empty if none were found, never {@code null}
     * @throws IOException if the URL cannot be opened or reading fails
     */
    public static List<String> getMailsByWeb() throws IOException {
        // 1. Open the source.
        URL url = new URL("http://127.0.0.1:8088/Test/index.html");
        // try-with-resources closes the reader (and the underlying network
        // stream) even when reading throws part-way through.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream()))) {
            return extractMails(br);
        }
    }

    /**
     * Reads the local file {@code D:\index.html} and collects every e-mail
     * address found in it.
     *
     * <p>No explicit charset is given to the reader, so the JVM's default
     * charset is used to decode the file.
     *
     * @return the matched addresses in order of appearance (duplicates kept);
     *         empty if none were found, never {@code null}
     * @throws IOException if the file cannot be opened or reading fails
     */
    public static List<String> getMails() throws IOException {
        // 1. Open the source.
        // try-with-resources releases the file handle even when reading throws.
        try (BufferedReader br = new BufferedReader(new FileReader("D:\\index.html"))) {
            return extractMails(br);
        }
    }

    /**
     * Scans the reader line by line and gathers every substring that matches
     * {@link #MAIL_PATTERN}. Matching is done per line, so a match never
     * spans a line break. The caller remains responsible for closing the
     * reader.
     */
    private static List<String> extractMails(BufferedReader reader) throws IOException {
        List<String> list = new ArrayList<String>();
        String line;
        while ((line = reader.readLine()) != null) {
            // 2. Apply the rule to the data that was read.
            Matcher m = MAIL_PATTERN.matcher(line);
            while (m.find()) {
                // 3. Store each piece of matching data in the collection.
                list.add(m.group());
            }
        }
        return list;
    }
}

原创粉丝点击