A web crawler I wrote for one specific website. I'm a beginner, so please take a look and point out anything that could be done better. Thanks!


The full code is below.

Along the way I also verified that log4j works in non-web projects as well (it needs a log4j configuration file, either log4j.properties or log4j.xml).

In the code you just load the configuration file and obtain a logger; the details are in the code below.
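
For anyone who wants to try it, a minimal log4j.properties roughly like the one below is enough for a plain Java project. The appender name and the pattern here are only an illustration, not necessarily the exact file I use:

# minimal log4j.properties sketch for a non-web project
# (console appender only; adjust the level and pattern as needed)
log4j.rootLogger=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c - %m%n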

package net.rytong.myspider;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.HashSet;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;

/**
 * Web crawler used to download the e-book 鬼吹灯 chapter by chapter.
 *
 * @author zhou_dong
 */
public class MyHtmlParser {

    // links that have already been visited
    private static Set<String> vistedLinks = new HashSet<String>();
    // number of chapters that failed to download
    private static Integer failureCount = 0;

    // check whether a url has already been visited
    private static boolean judgeUrl(String url) {
        boolean flag = false;
        if (vistedLinks != null && vistedLinks.size() > 0) {
            if (vistedLinks.contains(url)) {
                flag = true;
            }
        }
        return flag;
    }

    // load the log4j configuration file and obtain a logger
    public static Logger loadLog4j() {
        PropertyConfigurator.configure("E:/rytong/myeclipsework/my_spider/config/log4j.properties");
        final Logger logger = Logger.getLogger("");
        return logger;
    }

    public static void main(String[] args) {
        String url = "http://www.bxwx.org/b/3/3870/";
        // collect all chapter urls of 鬼吹灯
        final Set<String> aUrl = getAUrl(url);
        System.out.println("total urls: " + aUrl.size());
        // start the crawl with multiple threads
        myThread(aUrl, 5);
    }

    private static void myThread(final Set<String> aUrl, Integer count) {
        for (int i = 0; i < count; i++) {
            Thread thread = new Thread(new Runnable() {
                public void run() {
                    while (true) {
                        try {
                            synchronized (aUrl) {
                                Thread.sleep(500);
                                // fetch the chapters with multiple threads
                                myImportTxt(aUrl);
                            }
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                            continue;
                        }
                    }
                }
            }, "Thread" + i);
            thread.start();
        }
    }

    // read the content of each chapter and save it to a text file
    private static void myImportTxt(Set<String> aUrl) {
        // filter that matches the div whose id is "content"
        HasAttributeFilter filter = new HasAttributeFilter("id", "content");
        // combined filter: the page title plus the content div
        OrFilter divContext = new OrFilter(new NodeClassFilter(TitleTag.class), filter);
        OutputStream output = null;
        int y = 1;
        for (String myUrl : aUrl) {
            // skip urls that have already been processed
            if (judgeUrl(myUrl)) {
                System.out.println("myUrl: " + myUrl + " already visited, skipped");
                continue;
            }
            try {
                Parser parser = new Parser(myUrl);
                parser.setEncoding("gb2312");
                // all nodes matching the title tag or the content div
                NodeList list = parser.extractAllNodesThatMatch(divContext);
                StringBuffer text = new StringBuffer();
                for (int i = 0; i < list.size(); i++) {
                    // plain text of each matched node
                    text.append(list.elementAt(i).toPlainTextString() + "\r\n");
                }
                String myText = text.toString();
                byte[] bytes = myText.getBytes();
                long currentTimeMillis = System.currentTimeMillis();
                String bookTxt = "E:/rytong/mytext/test4/" + "鬼吹灯" + y + "(" + String.valueOf(currentTimeMillis) + ").txt";
                output = new FileOutputStream(new File(bookTxt));
                output.write(bytes, 0, bytes.length);
                output.flush();
                output.close();
                // obtain the log4j logger and log the result
                Logger myLogger = loadLog4j();
                myLogger.info(bookTxt + " saved successfully");
                y++;
                // remember that this url has been visited
                vistedLinks.add(myUrl);
            } catch (Exception e) {
                e.printStackTrace();
                failureCount++;
                System.out.println(myUrl);
                continue;
            }
        }
    }

    // collect all chapter urls from the index page
    @SuppressWarnings("serial")
    public static Set<String> getAUrl(String url) {
        Set<String> myUrls = new HashSet<String>();
        try {
            // parse the index page
            Parser parser = new Parser(url);
            // the site is encoded as gb2312
            parser.setEncoding("gb2312");
            // filter for <frame> tags; not really needed here, only kept as an example of how to write a NodeFilter
            NodeFilter frameFilter = new NodeFilter() {
                public boolean accept(Node node) {
                    String text = node.getText();
                    if (text.startsWith("frame src = ")) {
                        return true;
                    } else {
                        return false;
                    }
                }
            };
            OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
            // all nodes matching the link filter
            NodeList nodeList = parser.extractAllNodesThatMatch(linkFilter);
            for (int i = 0; i < nodeList.size(); i++) {
                Node readNode = nodeList.elementAt(i);
                if (readNode instanceof LinkTag) {
                    // the href of each <a> tag
                    String link = ((LinkTag) readNode).getLink();
                    myUrls.add(link);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return myUrls;
    }
}
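
One part I already suspect is weak is the threading: every thread locks the whole url set and then walks all of it, so the workers mostly just take turns instead of sharing the work. Handing the urls out through a shared queue would probably be cleaner. Roughly what I mean (only a sketch; QueueSketch and crawlWithQueue are made-up names, and the actual chapter download is left out):

import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;

public class QueueSketch {

    // Sketch only: hand the chapter urls out through a shared queue so that
    // each url is taken by exactly one worker thread.
    static void crawlWithQueue(Set<String> aUrl, int threads) {
        final ConcurrentLinkedQueue<String> queue = new ConcurrentLinkedQueue<String>(aUrl);
        for (int i = 0; i < threads; i++) {
            new Thread(new Runnable() {
                public void run() {
                    String myUrl;
                    // poll() removes one url at a time; when it returns null all urls are taken
                    while ((myUrl = queue.poll()) != null) {
                        // download and save this one chapter here
                        // (i.e. the body of myImportTxt for a single url)
                    }
                }
            }, "Worker").start();
        }
    }
}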


