简易版的Java爬豆瓣网中的好友列表的实现
来源:互联网 发布:mac os x 10.7 iso 编辑:程序博客网 时间:2024/06/06 00:07
第一部分:访问需要登录的页面,取得相应的验证码图片并保存到本地,同时取得验证码对应的id。
import java.net.*;
import java.util.regex.Matcher;import java.util.regex.Pattern;
import java.io.*;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Fetches a login page, prints the value of the hidden input found inside the
 * element with class {@code captcha_block}, and saves the first
 * {@code img[id]} image referenced by an {@code http://} URL to a local file.
 */
class L
{
    /** Matches the first http:// URL up to (not including) the closing quote. */
    private static final Pattern IMAGE_URL = Pattern.compile("http://(\\S[^\"]*)");
    /** Matches from the hidden-input marker to the end of that line of markup. */
    private static final Pattern HIDDEN_INPUT = Pattern.compile("type=\"hidden\".*");
    /** Captures the content of a value="..." attribute. */
    private static final Pattern VALUE_ATTR = Pattern.compile("value=\"(\\S[^\"]*)");
    /** Destination of the downloaded image. */
    private static final String IMAGE_FILE = "D:\\Users\\ibm\\Desktop\\douban.jpg";

    String str = "";

    /**
     * @param str URL of the page to fetch
     */
    public L(String str)
    {
        this.str = str;
    }

    /**
     * Downloads the page, prints the hidden input's value to standard output
     * and stores the image on disk.
     *
     * A missing hidden input or image URL is reported on standard error
     * instead of failing with an {@code IllegalStateException} from
     * {@code Matcher.group()}. I/O failures are printed, as before.
     */
    public void log()
    {
        try
        {
            Document doc = Jsoup.connect(str).timeout(50000).get();
            Elements images = doc.select("img[id]");
            Elements inputs = doc.getElementsByAttributeValue("class", "captcha_block").select("input");

            Matcher hidden = HIDDEN_INPUT.matcher(inputs.toString());
            if (hidden.find())
            {
                Matcher value = VALUE_ATTR.matcher(hidden.group());
                if (value.find())
                {
                    System.out.println(value.group(1));
                }
            }
            else
            {
                System.err.println("No hidden input found in captcha_block on " + str);
            }

            Matcher image = IMAGE_URL.matcher(images.toString());
            if (!image.find())
            {
                System.err.println("No image URL found on " + str);
                return;
            }
            saveImage(image.group());
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }

    /** Copies the resource at imageUrl to IMAGE_FILE, always closing both streams. */
    private static void saveImage(String imageUrl) throws IOException
    {
        InputStream in = new URL(imageUrl).openConnection().getInputStream();
        try
        {
            OutputStream out = new FileOutputStream(new File(IMAGE_FILE));
            try
            {
                // Buffered copy instead of one read()/write() call per byte.
                byte[] buffer = new byte[8192];
                int count;
                while ((count = in.read(buffer)) != -1)
                {
                    out.write(buffer, 0, count);
                }
            }
            finally
            {
                out.close();
            }
        }
        finally
        {
            in.close();
        }
    }
}
public class W
{
public static void main(String args[])
{
L l = new L("http://www.douban.com/login");
l.log();
}
}
以下是具体实现爬取好友列表的代码。注意:我用的是BFS(广度优先遍历)算法来爬取好友信息。
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.*;
import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Minimal first-in, first-out queue of strings backed by a {@link LinkedList}.
 */
class Queue
{
    LinkedList<String> queue = new LinkedList<String>();

    /**
     * Appends an element at the tail of the queue.
     *
     * @param t element to append
     */
    public void enQueue(String t)
    {
        queue.add(t);
    }

    /**
     * Removes and returns the element at the head of the queue.
     *
     * @return the removed head element
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public Object deQueue()
    {
        return queue.remove();
    }

    /**
     * @return {@code true} when the queue holds no elements
     */
    public boolean isQueueEmpty()
    {
        return queue.size() == 0;
    }

    /**
     * @param t element to look for
     * @return {@code true} when an equal element is currently queued
     */
    public boolean contains(Object t)
    {
        return queue.indexOf(t) >= 0;
    }
}
/**
 * Static bookkeeping for the crawl: the set of URLs already visited and the
 * FIFO queue of URLs still waiting to be visited.
 */
class LinkQueue
{
    /** URLs that have been visited, in insertion order. */
    public static Set<String> visitedUrl = new LinkedHashSet<String>();
    /** URLs waiting to be visited. */
    public static Queue unVisitedUrl = new Queue();

    /**
     * @return the queue of URLs waiting to be visited
     */
    public static Queue getUnVisitedUrl()
    {
        return unVisitedUrl;
    }

    /**
     * Records a URL as visited.
     *
     * @param url URL to record
     */
    public static void addVisitedUrl(String url)
    {
        visitedUrl.add(url);
    }

    /**
     * Removes a URL from the visited set.
     *
     * @param url URL to remove
     */
    public static void removeVisitedUrl(String url)
    {
        visitedUrl.remove(url);
    }

    /**
     * Removes and returns the next URL waiting to be visited.
     *
     * @return the head of the pending queue
     */
    public static Object unVisitedUrlDequeue()
    {
        return unVisitedUrl.deQueue();
    }

    /**
     * Queues a URL unless it is null, blank after trimming, already visited,
     * or already waiting in the pending queue.
     *
     * @param url candidate URL
     */
    public static void addUnvisitedUrl(String url)
    {
        if (url == null || url.trim().equals(""))
        {
            return;
        }
        if (visitedUrl.contains(url) || unVisitedUrl.contains(url))
        {
            return;
        }
        unVisitedUrl.enQueue(url);
    }

    /**
     * @return number of URLs recorded as visited
     */
    public static int getVisitedUrlNum()
    {
        return visitedUrl.size();
    }

    /**
     * @return {@code true} when no URL is waiting to be visited
     */
    public static boolean unVisitedUrlsEmpty()
    {
        return unVisitedUrl.isQueueEmpty();
    }
}
/**
 * HTTP helper for the crawler: posts the login form and extracts contact
 * links from a contacts page, using one shared {@link HttpClient} instance.
 */
class D
{
    /** Client shared by {@link #log(String)} and {@link #getAddress(String)}. */
    static HttpClient httpClient = new HttpClient();

    /**
     * Captures the path segment that follows the first "e/" (as in
     * ".../people/ID/..."). Equivalent to both "e/" patterns the original code
     * compiled; compiled once here instead of once per call or per link.
     */
    private static final Pattern PEOPLE_ID = Pattern.compile("e/(\\S[^/]*)");
    /** Captures the target of an {@code a href="..."} attribute. */
    private static final Pattern HREF = Pattern.compile("a href=\"(\\S[^\"]*)");
    /** File that receives one "owner -> contact" line per discovered link (append mode). */
    private static final String OUTPUT_FILE = "d:/Users/ibm/Desktop/关注度1.txt";

    /**
     * Posts the login form to the given URL and prints the status code, the
     * response body and the {@code Location} header. On a 302 response that
     * carries a {@code Location} header it additionally requests the contacts
     * list page. Both connections are released when done.
     *
     * @param str URL the login form is posted to
     */
    public static void log(String str)
    {
        PostMethod postMethod = new PostMethod(str);
        try
        {
            // NOTE(review): account credentials and the captcha id/solution are
            // hard-coded; they should come from configuration or user input and
            // the captcha pair is presumably only valid for one session - confirm.
            NameValuePair[] postData = {
                new NameValuePair("form_email", "sunwangdong121212@163.com"),
                new NameValuePair("form_password", "swd85153866"),
                new NameValuePair("source", "index_nav"),
                new NameValuePair("captcha-solution", "collar"),
                new NameValuePair("captcha-id", "uMHK3Hgrhe4KaiyY6r6gCt8T:en")
            };
            postMethod.setRequestBody(postData);
            int statusCode = httpClient.executeMethod(postMethod);
            System.out.println(statusCode);
            System.out.println("response=" + postMethod.getResponseBodyAsString());
            if (statusCode == 302)
            {
                httpClient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
                httpClient.getParams().setParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true);
                Header header = postMethod.getResponseHeader("Location");
                System.out.println(header);
                if (header != null)
                {
                    GetMethod redirect = new GetMethod("http://www.douban.com/contacts/list");
                    try
                    {
                        httpClient.executeMethod(redirect);
                    }
                    finally
                    {
                        redirect.releaseConnection();
                    }
                    System.out.println();
                }
            }
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        finally
        {
            // Previously commented out, which left the connection checked out.
            postMethod.releaseConnection();
        }
    }

    /** Result of the most recent {@link #getAddress(String)} call. */
    public static Set<String> l;

    /**
     * Fetches a contacts page, extracts the people ids linked from the
     * {@code dt} children of elements with class {@code obu}, appends one
     * "owner -> id" line per link to the output file and returns the
     * corresponding contacts-page URLs.
     *
     * I/O failures are printed and whatever was collected so far is returned.
     *
     * @param url contacts page URL, e.g. http://www.douban.com/people/12123332/contacts
     * @return contacts-page URLs discovered on the page, in encounter order
     *         (never null; also stored in {@link #l})
     */
    public static Set<String> getAddress(String url)
    {
        Set<String> found = new LinkedHashSet<String>();
        l = found;
        GetMethod get = new GetMethod(url);
        try
        {
            httpClient.executeMethod(get);
            System.out.println();

            String owner = "";
            Matcher ownerMatcher = PEOPLE_ID.matcher(url);
            if (ownerMatcher.find())
            {
                owner = ownerMatcher.group(1);
            }

            // A response without a body yields null; parse it as an empty page.
            String body = get.getResponseBodyAsString();
            Document doc = Jsoup.parse(body == null ? "" : body);
            Elements entries = doc.getElementsByAttributeValue("class", "obu").select("dt");
            Matcher hrefs = HREF.matcher(entries.toString());
            System.out.println(owner);

            BufferedWriter bw = new BufferedWriter(new FileWriter(new File(OUTPUT_FILE), true));
            try
            {
                while (hrefs.find())
                {
                    Matcher ids = PEOPLE_ID.matcher(hrefs.group(1));
                    while (ids.find())
                    {
                        String id = ids.group(1);
                        found.add("http://www.douban.com/people/" + id + "/contacts");
                        System.out.println(id);
                        bw.write(owner + " -> " + id);
                        bw.newLine();
                        // Flush per line so progress survives an abort mid-page.
                        bw.flush();
                    }
                }
            }
            finally
            {
                bw.close();
            }
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        finally
        {
            get.releaseConnection();
        }
        return found;
    }
}
/**
 * Breadth-first crawler driver: logs in, seeds the pending queue and visits
 * queued contacts pages until the queue is empty or 10 URLs have been visited.
 */
public class Z
{
    static LinkQueue lq = new LinkQueue();

    /**
     * Performs the login request and queues every seed URL.
     *
     * @param seeds initial URLs to crawl
     */
    public void initZ(String[] seeds)
    {
        D.log("http://www.douban.com/login");
        for (String seed : seeds)
        {
            LinkQueue.addUnvisitedUrl(seed);
        }
    }

    /**
     * Runs the crawl starting from the given seeds and prints the number of
     * visited URLs when it stops.
     *
     * @param seeds initial URLs to crawl
     * @throws Exception declared by the original signature
     */
    public void craw(String[] seeds) throws Exception
    {
        initZ(seeds);
        while (!(LinkQueue.unVisitedUrlsEmpty() || LinkQueue.getVisitedUrlNum() >= 10))
        {
            String visitUrl = (String) LinkQueue.unVisitedUrlDequeue();
            if (visitUrl == null)
            {
                continue;
            }
            // Fetch first, then mark as visited, then queue the discovered links.
            Set<String> links = D.getAddress(visitUrl);
            LinkQueue.addVisitedUrl(visitUrl);
            if (links == null)
            {
                continue;
            }
            for (String link : links)
            {
                LinkQueue.addUnvisitedUrl(link);
            }
        }
        System.out.println(LinkQueue.getVisitedUrlNum());
    }

    /**
     * Starts the crawl from a single seed contacts page.
     *
     * @param args command-line arguments (not used)
     * @throws Exception propagated from {@link #craw(String[])}
     */
    public static void main(String args[]) throws Exception
    {
        String seeds[] = {"http://www.douban.com/people/summertracy/contacts"};
        new Z().craw(seeds);
    }
}
- 简易版的Java爬豆瓣网中的好友列表的实现
- 好友列表的实现
- 好友列表的实现
- Java学习笔记-------好友列表的实现------卡片布局 实例
- QQ - iPhone 风格的好友列表实现
- 实现好友列表的显示以及隐藏
- js模拟QQ好友列表的实现
- vue的简易豆瓣图书
- 实现二级列表(实现QQ的好友列表)
- iOS 简易的实现类似QQ好友菜单
- 如何实现IM中的好友列表
- 模仿QQ好友列表的ExpandableListView实现的效
- IOS详解TableView —— QQ好友列表的实现
- IOS详解TableView —— QQ好友列表的实现
- 65 Android ExpandableListView (QQ好友列表的实现)
- 自定义ExpandableListView 实现像QQ好友列表一样的功能
- Android开发学习之QQ好友列表的实现
- JavaScript实现的购物车效果-好友列表效果
- 软件过程管理在软件项目中的作用
- NYIST 116 士兵杀敌(二)
- 冬令营一月二十九号工作日志
- 深入理解Android(04)——深入理解属性服务
- ios 实现引导页面效果
- 简易版的Java爬豆瓣网中的好友列表的实现
- Goldengate升级之目标端(replicat端)升级
- C++ 顶层底层 const
- 【分治法】循环赛事日程表
- Java中String类的一些方法的返回值的结果
- 截图之路01
- 线程轮询和线程回调
- 使用SenchaCMD搭建ExtJs5.1 MVVM开发环境
- MD5加密的C#程序