jsoup解析HTML,爬取小说实例

来源:互联网 发布:淘宝卖家 人工服务电话 编辑:程序博客网 时间:2024/05/16 10:04

1. Java 的 File.separator:与平台相关的路径分隔符(Windows 下为 \,Unix/Linux 下为 /),拼接路径时用它代替手写的斜杠

2. jsoup 解析标签:Element 的 text() 方法可直接取出起始标签与结束标签之间的文本

import java.io.File;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class Test {public static void main(String[] args) throws Exception {// TODO Auto-generated method stubDocument doc = Jsoup.connect("http://www.biquge5.com/2_2975/1388243.html").get();Elements links = doc.select("a[href]");for (Element link:links){if (link.text().contentEquals("上一章")||link.text().contentEquals("下一章"))System.out.println(link.attr("abs:href").trim()+"---"+link.text());}Element content = doc.getElementById("content");//System.out.println(content.text());String [] sentences ;sentences = content.text().split(" ");for (String sen : sentences){sen = sen.trim();sen = sen+"\r\n";try {File dir = new File("F:"+File.separator+"book");if(!dir.exists()){dir.mkdirs();System.out.println("小说"+"F:"+File.separator+"book"+"目录下");}File file = new File( "F:"+File.separator+"book"+File.separator+"text.txt");FileOutputStream os = new FileOutputStream(file,true);try {os.write(sen.getBytes());os.close();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}}catch (FileNotFoundException e) {// TODO Auto-generated catch blocke.printStackTrace();}}}}


0 0