Java Crawler



A simple web page crawler in Java. It loads a forum list page from bbs.zhiyoo.com (either from a locally saved copy or directly over HTTP), extracts the thread titles, reply counts, and view counts with regular expressions, and writes one record per line to a text file.


package calcium.tools.grex;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MyGrex {

    public static void main(String[] args) throws IOException {
        PrintWriter pw = new PrintWriter(new FileWriter("F:a.txt"));
        StringBuffer buf = new StringBuffer(1024 * 1024);

        // Read a locally saved copy of the forum page into buf...
        String html = "F:a.html";
        readTxtFile(html, buf);
        // ...or fetch the live page over HTTP instead:
        // String html = "http://bbs.zhiyoo.com/forum.php?mod=modcp&action=thread&op=thread";
        // readHtml(html, buf);

        List<String> a = getLink(buf.toString());    // thread titles
        List<String> b = getHuifu(buf.toString());   // reply counts
        List<String> c = getDianji(buf.toString());  // view (click) counts
        getLinkl(buf.toString());                    // thread URLs (printed only)

        // Minimal regex sanity check:
        // Pattern pattern = Pattern.compile("href=\"(.+?)\"");
        // Matcher matcher = pattern.matcher("<a href=\"index.html\">主页</a>");
        // if (matcher.find()) {
        //     System.out.println(matcher.group(1));
        // }

        // Write one "title V replies V views" record per line.
        for (int i = 0; i < a.size(); i++) {
            System.out.println(a.get(i));
            pw.write(a.get(i));
            pw.write("V");
            pw.write(b.get(i));
            pw.write("V");
            pw.write(c.get(i));
            pw.write("\n");
        }
        pw.close();
    }

    // Extract the thread titles from the list page.
    public static List<String> getLink(String html) throws IOException {
        Pattern p = Pattern.compile("<a href=\"http://bbs.zhiyoo.com/forum.php"
                + "\\?mod=viewthread&tid=[0-9]{7}\" target=\"_blank\"(| style=\"font-weight: bold;color: #8F2A90\")>"
                + "(.*?)</a>");
        Matcher m = p.matcher(html);
        ArrayList<String> alist = new ArrayList<String>();
        while (m.find()) {
            alist.add(m.group(2));
            System.out.println(m.group(2));
        }
        return alist;
    }

    // Chinese characters: [^x00-xff]*
    // Extract the thread URLs themselves. The original pattern used a bare [\s\S]
    // (exactly one character) and an unclosed <span>; fixed to a non-greedy match
    // up to the closing </a></span>.
    public static List<String> getLinkl(String html) throws IOException {
        Pattern p = Pattern.compile("<span id=\"thread_[0-9]{7}\"><a href=\"(.*?)\" target=\"_blank\""
                + "(| style=\"font-weight: bold;color: #8F2A90\")>[\\s\\S]*?</a></span>");
        Matcher m = p.matcher(html);
        ArrayList<String> alist = new ArrayList<String>();
        while (m.find()) {
            alist.add(m.group(1));
            System.out.println(m.group(1));
            System.out.println(m.group());
        }
        return alist;
    }

    // Extract reply counts.
    public static List<String> getHuifu(String s) {
        // <span class="xi2">31</span><em>374</em><a.*?</a>
        String regex = "<span class=\"xi2\">(.+?)</span>";
        Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
        Matcher ma = pa.matcher(s);
        List<String> blist = new ArrayList<String>();
        while (ma.find()) {
            blist.add(ma.group(1));
            System.out.println(ma.group(1));
        }
        return blist;
    }

    // Extract view (click) counts.
    public static List<String> getDianji(String s) {
        // <td class="num"><span class="xi2">25</span><em>504</em></td>
        String regex = "<td class=\"num\"><span class=\"xi2\">[0-9]{2}</span><em>(.+?)</em></td>";
        Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
        Matcher ma = pa.matcher(s);
        List<String> clist = new ArrayList<String>();
        while (ma.find()) {
            clist.add(ma.group(1));
            System.out.println(ma.group(1));
        }
        return clist;
    }

    // Read a local file into buf, assuming GBK encoding.
    public static void readTxtFile(String filePath, StringBuffer buf) {
        try {
            String encoding = "GBK";
            File file = new File(filePath);
            if (file.isFile() && file.exists()) { // check that the file exists
                InputStreamReader read = new InputStreamReader(
                        new FileInputStream(file), encoding); // respect the file's encoding
                BufferedReader bufferedReader = new BufferedReader(read);
                String lineTxt = null;
                while ((lineTxt = bufferedReader.readLine()) != null) {
                    buf.append(lineTxt);
                }
                read.close();
            } else {
                System.out.println("Cannot find the specified file");
            }
        } catch (Exception e) {
            System.out.println("Error reading the file contents");
            e.printStackTrace();
        }
    }

    // Fetch a page over HTTP into buf, assuming GBK encoding.
    // The original read a line in the loop condition and again in the body,
    // which skipped every other line and could append "null"; read once per iteration.
    public static void readHtml(String html, StringBuffer buf) {
        try {
            String encoding = "GBK";
            URL url = new URL(html);
            InputStreamReader read = new InputStreamReader(url.openStream(), encoding);
            BufferedReader bufferedReader = new BufferedReader(read);
            String s;
            while ((s = bufferedReader.readLine()) != null) {
                buf.append(s);
                System.out.println(s);
            }
            read.close();
        } catch (Exception e) {
            System.out.println("null");
        }
    }
}
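For reference, the commented-out snippet in main() shows the core idea in isolation: compile an href pattern, run the Matcher over a fragment of HTML, and pull out the captured group. Below is a minimal standalone sketch of that same technique; the class name LinkDemo and the sample HTML string are illustrative only and not part of the original code.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Minimal demo of the regex-based link extraction used by the crawler above.
public class LinkDemo {
    public static void main(String[] args) {
        String html = "<a href=\"index.html\">主页</a> <a href=\"forum.php?tid=1234567\">thread</a>";
        // Non-greedy capture of everything between href=" and the next quote.
        Pattern pattern = Pattern.compile("href=\"(.+?)\"");
        Matcher matcher = pattern.matcher(html);
        while (matcher.find()) {
            System.out.println(matcher.group(1)); // prints index.html, then forum.php?tid=1234567
        }
    }
}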

