博客首页博文截取列表

来源:互联网 发布:平价好用的面霜知乎 编辑:程序博客网 时间:2024/04/30 06:07

1 在 SQL 语句中使用 substring() 函数,截取博文内容字段的前一部分;

2 运用 htmlparser 处理截取后的内容(去除 script 标签、限制图片的宽和高);

3 运用 htmlparser 补齐因截取而未闭合的标签。

 

代码如下:

package cn.blog.parser;

import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.tags.CompositeTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;

public class ParserTest {

 private static final Logger logger = Logger.getLogger(Parser.class);

 public static String readHtml(String url) {

  try {
   Parser parser = new Parser();
   parser.setURL(url);
   parser.setEncoding(parser.getEncoding());
   NodeVisitor visitor = new NodeVisitor() {
    public void visitTag(Tag tag) {
     logger.fatal("testVisitorAll()  Tag name is :"
       + tag.getTagName() + " /n Class is :"
       + tag.getClass());
    }
   };
   parser.visitAllNodesWith(visitor);

  } catch (ParserException e) {
   e.printStackTrace();
  }

  return "";
 }

 public static String readWithTag(String url, int len) throws IOException {

  // java.io.FileReader fred = new FileReader(url);
  FileInputStream fin = new FileInputStream(url);
  byte[] bb = new byte[fin.available()];
  fin.read(bb);
  String content = new String(bb);
  fin.close();
  return content.substring(0, content.length() > len ? len : content
    .length());
 }

 public static void writeWithTag(String content, String url)
   throws IOException {
  FileOutputStream fou = new FileOutputStream(url);
  byte[] bb = content.getBytes();
  fou.write(bb);

  fou.close();

 }

 public static String subcontent(String content) {
  try {

   content = endTagValidate(content);
   Parser parser = Parser.createParser(content, "utf-8");
   NodeList list = parser.parse(null);
   //处理页面内容
   visitNodeList(list);
   content = list.toHtml();
   //补齐标签
   return addEndTag(content);

  } catch (Exception e) {
   return "";
  }

 }
 //处理截取完后的最后标签
 private static String endTagValidate(String content)
 {
  int end = content.lastIndexOf("<");
  String ss = content.substring(end, content.length()).toLowerCase();
  //"" 替换成视频标签
  if (ss.startsWith("img") || ss.startsWith(""))
  {
   if (ss.indexOf(">") == -1) {
    content = content.substring(0, end);
   }
  }
  return content;
 }

 /**
  * 补齐标签的结尾
  * @param html
  * @return
  * @throws UnsupportedEncodingException
  * @throws ParserException
  */
 private static String addEndTag(String html)
   throws UnsupportedEncodingException, ParserException {
  Parser parser = Parser.createParser(html, "utf-8");
  NodeList nodelist = parser.extractAllNodesThatMatch(new NodeFilter() {
   public boolean accept(Node node) {
    if (node instanceof CompositeTag) {
     return true;
    }
    return false;
   }
  });
  String str = "";
  String tmp = "";
  for (int i = 0; i < nodelist.size(); i++) {
   CompositeTag testTag = (CompositeTag) nodelist.elementAt(i);
   if (testTag.getParent() == null) {
    // 记住这里只需循环第一层就能帮你补齐的了
    tmp =  testTag.toHtml();
    str += tmp + "/n";
   }
  }
  return str + "...";
 }

 /**
  * 处理html的内容 (去除script、 将图片的高和宽限制了)
  *
  * @param list
  */
 private static void visitNodeList(NodeList list) {
  for (int i = 0; i < list.size(); i++) {
   Node node = list.elementAt(i);
   if (node instanceof ScriptTag) {
    list.remove(i);
    continue;
   }
   if (node instanceof ImageTag) {
    ImageTag img = (ImageTag) node;
    //限制图片的长宽
    img.setAttribute("width", "/"100/"");
    img.setAttribute("height", "/"75/"");
   }
   NodeList children = node.getChildren();
   if (children != null && children.size() > 0) {
    visitNodeList(children);
   }
  }
 }

 /**
  * @param args
  */
 public static void main(String[] args) {
  // TODO Auto-generated method stub
//  readHtml("http://www.google.com");
  try {
//   String content = readWithTag("D://htmlParser//y1.txt", 40000);
   String content = "<html><head><title>asdf</title></head><body>nihaodcesljk啊是的拉快点放假啊两节课";
   System.out.println(content);
   System.out.println(subcontent(content));
//   writeWithTag(subcontent(content),
//     "D://htmlParser//1.html");

  } catch (Exception e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  }
 }

}

原创粉丝点击