简易版的Java爬豆瓣网中的好友列表的实现

来源:互联网 发布:mac os x 10.7 iso 编辑:程序博客网 时间:2024/06/06 00:07

下面的代码访问需要登录的页面,取得验证码图片并保存到本地,同时从验证码区域的隐藏表单字段中取得对应的 id。

import java.net.*;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*;


import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;


class L
{
String str = "";
public L(String str)
{
this.str = str;
}
public void log()
{
try 
{
Document doc = Jsoup.connect(str).timeout(50000).get();
String infotable = doc.toString();
Elements infoTable = doc.select("img[id]");
Elements info = doc.getElementsByAttributeValue("class", "captcha_block");
Elements in = info.select("input");
Pattern p = Pattern.compile("http://(\\S[^\"]*)");
Matcher m = p.matcher(infoTable.toString());
if(m.find())
{
//System.out.println(m.group());
}
//System.out.println();
Pattern p1 = Pattern.compile("type=\"hidden\".*");
Matcher m1 = p1.matcher(in.toString());
if(m1.find())
{
//System.out.println(m1.group());
}
Pattern p2 = Pattern.compile("name=\"(\\S[^\"]*)");
Matcher m2 = p2.matcher(m1.group().toString());
if(m2.find())
{
//System.out.println(m2.group(1));
}
Pattern p3 = Pattern.compile("value=\"(\\S[^\"]*)");
Matcher m3 = p3.matcher(m1.group().toString());
if(m3.find())
System.out.println(m3.group(1));
URL url = new URL(m.group());
URLConnection uc = url.openConnection();
InputStream is = uc.getInputStream();
File file = new File("D:\\Users\\ibm\\Desktop\\douban.jpg");
FileOutputStream out = new FileOutputStream(file);
int i = 0;
while((i = is.read()) != -1)
{
out.write(i);
}
is.close();
}
catch(IOException e)
{
e.printStackTrace();
}
}
}
public class W 
{
public static void main(String args[])
{
L l = new L("http://www.douban.com/login");
l.log();
}

}


以下是具体实现爬取好友列表的代码。注意:这里使用 BFS(广度优先遍历)算法来爬取好友信息。

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.*;


import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;


/**
 * Minimal FIFO queue of strings backed by a {@link LinkedList}.
 */
class Queue
{
    /** Backing list; the head of the list is the next element to be removed. */
    LinkedList<String> queue = new LinkedList<String>();

    /** Appends {@code t} at the tail of the queue. */
    public void enQueue(String t)
    {
        queue.add(t);
    }

    /**
     * Removes and returns the element at the head of the queue.
     *
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public Object deQueue()
    {
        return queue.remove();
    }

    /** Returns {@code true} when the queue holds no elements. */
    public boolean isQueueEmpty()
    {
        return queue.size() == 0;
    }

    /** Returns {@code true} when an element equal to {@code t} is queued. */
    public boolean contains(Object t)
    {
        return queue.indexOf(t) >= 0;
    }
}


/**
 * Static bookkeeping for the crawl: the set of URLs already visited and the
 * FIFO queue of URLs still waiting to be visited.
 */
class LinkQueue
{
    /** URLs that have been visited, in insertion order. */
    public static Set<String> visitedUrl = new LinkedHashSet<String>();

    /** URLs waiting to be visited. */
    public static Queue unVisitedUrl = new Queue();

    /** Returns the queue of pending URLs. */
    public static Queue getUnVisitedUrl()
    {
        return unVisitedUrl;
    }

    /** Records {@code url} as visited. */
    public static void addVisitedUrl(String url)
    {
        visitedUrl.add(url);
    }

    /** Removes {@code url} from the visited set. */
    public static void removeVisitedUrl(String url)
    {
        visitedUrl.remove(url);
    }

    /** Removes and returns the next pending URL. */
    public static Object unVisitedUrlDequeue()
    {
        return unVisitedUrl.deQueue();
    }

    /**
     * Queues {@code url} unless it is null, blank, already visited, or
     * already pending.
     */
    public static void addUnvisitedUrl(String url)
    {
        if (url == null || url.trim().length() == 0)
        {
            return;
        }
        if (visitedUrl.contains(url) || unVisitedUrl.contains(url))
        {
            return;
        }
        unVisitedUrl.enQueue(url);
    }

    /** Returns how many URLs have been visited. */
    public static int getVisitedUrlNum()
    {
        return visitedUrl.size();
    }

    /** Returns {@code true} when no URLs are pending. */
    public static boolean unVisitedUrlsEmpty()
    {
        return unVisitedUrl.isQueueEmpty();
    }
}


class D
{
static HttpClient httpClient = new HttpClient();
//httpClient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
//httpClient.getParams().setParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true);
public static void log(String str)
{
try 
{
PostMethod postMethod = new PostMethod(str);
NameValuePair postData[] = new NameValuePair[5];
postData[0] = new NameValuePair("form_email","sunwangdong121212@163.com");
postData[1] = new NameValuePair("form_password","swd85153866");
postData[2] = new NameValuePair("source","index_nav");
postData[3] = new NameValuePair("captcha-solution","collar");
postData[4] = new NameValuePair("captcha-id","uMHK3Hgrhe4KaiyY6r6gCt8T:en");
postMethod.setRequestBody(postData);
int statusCode = httpClient.executeMethod(postMethod);
System.out.println(statusCode);
System.out.println("response=" + postMethod.getResponseBodyAsString());
if(statusCode == 302)
{
httpClient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
httpClient.getParams().setParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true);
Header header = postMethod.getResponseHeader("Location");
System.out.println(postMethod.getResponseHeader("Location"));
if(header != null)
{
   GetMethod redirect = new GetMethod("http://www.douban.com/contacts/list");
   httpClient.executeMethod(redirect);
   /*Cookie[] cookies = httpClient.getState().getCookies();
   for (int i = 0; i < cookies.length; i++) 
   {
    System.out.println("cookiename=="+cookies[i].getName() + " cookieValue=="+cookies[i].getValue());
   }*/
   System.out.println();
}
}
//postMethod.releaseConnection();
}
catch (IOException e) 
{
e.printStackTrace();
}
}


public  static Set<String> l;
public  static Set<String> getAddress(String url)
{
l = new LinkedHashSet<String>();
String st = new String();
String s = new String();
String string = new String();
Document doc = null;
try
{
GetMethod get = new GetMethod(url);          //url=http://www.douban.com/people/12123332/contacts
   httpClient.executeMethod(get);
   //System.out.println(get.getResponseBodyAsString());
   System.out.println();
   Pattern pat1 = Pattern.compile("e/(\\S[^\\/]*)");
Matcher mat1 = pat1.matcher(url);
if(mat1.find())
string = mat1.group(1);
   doc = Jsoup.parse(get.getResponseBodyAsString());
   Elements infoTable = doc.getElementsByAttributeValue("class", "obu");
   Elements info = infoTable.select("dt");
   Pattern pat = Pattern.compile("a href=\"(\\S[^\"]*)");
   Matcher mat = pat.matcher(info.toString());
   /*while(mat.find())
   {
    System.out.println(mat.group(1));
   }*/
System.out.println(string);
/*doc = Jsoup.parse("d:/Users/ibm/Desktop/关注度1.html");
   Elements info = doc.getElementsByAttributeValue("class", "obu");
   Elements in = info.select("dt");
   System.out.println(in.toString());
   Pattern pat = Pattern.compile("a href=\"(\\S[^\"]*)");
   Matcher mat = pat.matcher(in.toString());*/
   File f = new File("d:/Users/ibm/Desktop/关注度1.txt");
   FileWriter fw = new FileWriter(f,true);
   BufferedWriter bw = new BufferedWriter(fw);
   while(mat.find())
   {
   st = mat.group(1);
   Pattern pattern = Pattern.compile("e/(\\S[^/]*)");
   Matcher matcher = pattern.matcher(st);
   String s1 = "";
   while(matcher.find())
{
    s = matcher.group(1);
    //if(s.equals("ser_normal.jpg"));
    s1 = "http://www.douban.com/people/" + s + "/contacts";
    l.add(s1);
    System.out.println(s);
    bw.write(string + " -> " + s);
    bw.newLine();
    bw.flush();
}
   }
   bw.close();
}
catch(HttpStatusException e)
{
System.out.println("This is a error!");
}
catch(IOException e)
{
e.printStackTrace();
}
return l;
}
}


public class Z
{
//D d = new D();
static LinkQueue lq = new LinkQueue();
public void initZ(String[] seeds)
{

D.log("http://www.douban.com/login");
for(int i = 0; i < seeds.length; i++)
{
LinkQueue.addUnvisitedUrl(seeds[i]);
}
}
public void craw(String[] seeds) throws Exception
{
initZ(seeds);
while(! LinkQueue.unVisitedUrlsEmpty() && LinkQueue.getVisitedUrlNum() < 10)
{
String visitUrl = (String)LinkQueue.unVisitedUrlDequeue();
if(visitUrl == null)
continue;
else if(visitUrl != null)
{
Set<String> links = D.getAddress(visitUrl);
if(links != null)
{
LinkQueue.addVisitedUrl(visitUrl);
       for(String link : links)
       {
       LinkQueue.addUnvisitedUrl(link);
       }
       }
else if(links == null)
    {
    LinkQueue.addVisitedUrl(visitUrl);
    }
}
}
System.out.println(LinkQueue.getVisitedUrlNum());
}
public static void main(String args[]) throws Exception
{
Z z = new Z();
String seeds[] = {"http://www.douban.com/people/summertracy/contacts"};
z.craw(seeds);
}
}





0 0
原创粉丝点击