Java 学习:网络爬虫——中国人才热线邮箱抓取

来源:互联网 发布:夏米d5支持电信网络吗 编辑:程序博客网 时间:2024/04/29 20:04

这是一个很简陋的邮箱抓取程序,抓取效率很低,纯当练手、熟悉键盘。

1. 函数入口

public class Test01 {/** * @param args * @throws IOException */public static void main(String[] args)  {for (int i=1; i<=20; i++) {HtmlPage h1 = new HtmlPage(    "http://www.cjol.com/search/l2008/"+i+"/?Keyword=%E5%A4%96%E8%B4%B8%E4%B8%9A%E5%8A%A1%E5%91%98&KeywordType=3&RecentSelected=43",1);h1.pageCode();//new Thread(new mRunable(h1), ""+i).start();}//http://www.cjol.com/search/l2008/4/?Keyword=%E5%A4%96%E8%B4%B8%E4%B8%9A%E5%8A%A1%E5%91%98&KeywordType=3&RecentSelected=43//h1.email();//h1.pageCode();// h1.email();}}

2. 正则表达式

public class Regx {/** * 搜索业务员找公司页面 pat = "href=\"http://[\\w-\\./]+\">[\u4e00-\u9fa5]*有限公司</a>" *  * @param buf * @throws IOException */public synchronized static void findCompany(String buf) {List<String> companyList = new ArrayList<String>();// System.out.println("resource:"+buf);// System.out.println("findCompany()");Pattern pattern = Pattern // \\s*target=\"_blank\".compile("href=\"http://[\\w-\\./]+\">[\u4e00-\u9fa5]*有限公司</a>");Matcher matcher = pattern.matcher(buf);Pattern innerPattern = Pattern.compile("http:\\S+\"");while (matcher.find()) {String string = matcher.group();// System.out.println(string);Matcher innerMatcher = innerPattern.matcher(string);if (innerMatcher.find()) {String tmp = innerMatcher.group().replaceAll("\"", "");new HtmlPage(tmp, 2);String ttString = "公司招聘页面地址:" + tmp;System.out.println(ttString);HtmlPage.writLog(ttString);}companyList.add(string);}}/** * 找到公司官网主页地址 *  * @param buf * @throws IOException */public synchronized static void findWebSite(String buf) {List<String> webSiteList = new ArrayList<String>();Pattern pattern = Pattern.compile("网址:<a href=\"http://[\\w-\\./]+\"");Matcher matcher = pattern.matcher(buf);Pattern innerPattern = Pattern.compile("http:\\S+\"");while (matcher.find()) {String string = matcher.group();// System.out.println("找到啦:"+string);Matcher innerMatcher = innerPattern.matcher(string);if (innerMatcher.find()) {String tmp = innerMatcher.group().replaceAll("\"", "");String ttsString = "公司主页地址:" + tmp;System.out.println(ttsString);HtmlPage.writLog(ttsString);new HtmlPage(tmp, 3);}webSiteList.add(string);}}/** * 在官网主页找 联系我们/contact us <a href="contactus.asp">联系方式</a> * href="contact.php">CONTACT US</a> *  * @param str * @throws IOException */public synchronized static void findContanct(String url, String str) {List<String> webSiteList = new ArrayList<String>();Pattern pattern = Pattern.compile("href=\"[\\w-\\./\\?=]+\">contact",Pattern.CASE_INSENSITIVE);Matcher matcher = pattern.matcher(str);Pattern innerPattern 
= Pattern.compile("\".+\"");while (matcher.find()) {String string = matcher.group();String ttsString = "联系方式地址:" + string;System.out.println(ttsString);HtmlPage.writLog(ttsString);Matcher innerMatcher = innerPattern.matcher(string);if (innerMatcher.find()) {String tmp = innerMatcher.group().replaceAll("\"", "");new HtmlPage(url + "//" + tmp, 4);}webSiteList.add(string);}}/** * 邮箱地址验证 *  * @param str * @return */public synchronized static List<String> email(String str) {File file = new File("1.txt");RandomAccessFile rd = null;try {rd = new RandomAccessFile(file, "rw");rd.seek(file.length());} catch (IOException e1) {// TODO Auto-generated catch blocke1.printStackTrace();}Pattern pattern = Pattern.compile("[a-zA-Z0-9_.-]+@[a-zA-Z0-9-]+\\.[a-zA-Z]{2,4}");Matcher matcher = pattern.matcher(str);List<String> list = new ArrayList<String>();while (matcher.find()) {String reString = matcher.group();if (HtmlPage.putEmail(reString)) {System.out.println("邮箱:------------------------------------------------------------------------- "+ reString + "---------------");HtmlPage.writLog("邮箱: "+reString);try {rd.write(reString.getBytes());rd.write("\r\n".getBytes());} catch (IOException e) {HtmlPage.writLog(reString+" 写邮箱失败:"+e.getMessage());System.out.println("邮箱写入失败:"+e.getMessage());e.printStackTrace();} finally {try {if (rd!= null) rd.close();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}}list.add(reString);return list;}}return null;}}

3. 页面内容抓取

public class HtmlPage {private String spec;private int depth;// private String pageCode;private static List<String> emailList = new ArrayList<String>();public HtmlPage(String urlString, int depth)  {this.spec = urlString;this.depth = depth;System.out.println("---------"+urlString +"----"+ depth);HtmlPage.writLog("---------"+urlString +"----"+ depth);if (depth !=1)pageCode();}public void pageCode()  {URL url = null;try {url = new URL(spec);} catch (MalformedURLException e) {HtmlPage.writLog(spec+" 初始化失败:"+e.getMessage());System.out.println("url初始化失败");e.printStackTrace();return;}StringBuffer sBuffer = new StringBuffer();HttpURLConnection connection;try {connection = (HttpURLConnection) url.openConnection();connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");} catch (IOException e) {HtmlPage.writLog(spec+" 打开网址失败:"+e.getMessage());System.out.println("打开网址失败");e.printStackTrace();return;}connection.setDoOutput(true);// 网页编码//String charset = getCharset(connection.getContentType());BufferedReader br = null;try {br = new BufferedReader(new InputStreamReader(connection.getInputStream(), charset));} catch (UnsupportedEncodingException e) {HtmlPage.writLog(spec+" 初始化输入流:"+e.getMessage());e.printStackTrace();return;} catch (IOException e) {HtmlPage.writLog(spec+" 初始化输入流:"+e.getMessage());e.printStackTrace();return;}String str = null;try {while ((str = br.readLine()) != null) {switch (depth) {case 1:Regx.findCompany(str);break;case 2:Regx.findWebSite(str);break;case 3:Regx.findContanct(spec, str);Regx.email(str);break;case 4:Regx.email(str);break;default:break;}}} catch (IOException e) {HtmlPage.writLog(spec+" 读取输入流:"+e.getMessage());System.out.println(e.getMessage());//e.printStackTrace();return;}}/** * 网页编码 *  * @param contentType * @return */private String getCharset(String contentType) {if (contentType == null)return "gbk";Pattern pattern = Pattern.compile("charset=.*");Matcher matcher = pattern.matcher(contentType);if 
(matcher.find())return matcher.group(0).split("charset=")[1];return "gbk";}public synchronized static boolean putEmail(String str) {if (!emailList.contains(str)) {emailList.add(str);return true;}return false;}public synchronized static void writLog(String str) {File file = new File("log.txt");RandomAccessFile rd = null;try { rd = new RandomAccessFile(file, "rw");int len = (int) file.length();rd.seek(len);rd.write(str.getBytes());rd.write("\r\n".getBytes());} catch (FileNotFoundException e) {System.out.println("日志写入失败!");e.printStackTrace();} catch (IOException e) {System.out.println("日志写入失败!");e.printStackTrace();} finally {try {if (rd!= null) rd.close();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}}}public void email() {int len = emailList.size();for (int i=0; i<len; i++) {System.out.println(emailList.get(i));}}}


4. 多线程

/**
 * Runnable wrapper that crawls one {@link HtmlPage} on whichever thread
 * executes it, announcing start and end on stdout and in the log.
 */
public class mRunable implements Runnable {

    /** Page to crawl; stays {@code null} when the no-argument constructor is used. */
    private HtmlPage htmlPage;

    /** Creates a task without a page. */
    public mRunable() {
    }

    /**
     * Creates a task for the given page.
     *
     * @param htmlPage page whose {@code pageCode()} is invoked by {@link #run()}
     */
    public mRunable(HtmlPage htmlPage) {
        this.htmlPage = htmlPage;
    }

    /** Logs a start banner, crawls the page, then logs an end banner. */
    @Override
    public void run() {
        final String worker = Thread.currentThread().getName();

        System.out.println("\n\n线程---------------------------------------------------- ----" + worker + "滴滴开始了啦----------\n\n\n");
        HtmlPage.writLog("线程" + worker + "开始运行");

        htmlPage.pageCode();

        System.out.println("\n\n线程----------------------------------------------------  ----" + worker + "完成工作啦----------\n\n\n");
        HtmlPage.writLog("线程" + worker + "运行结束");
    }
}


0 0
原创粉丝点击