新浪微博关键字搜索结果抓取

来源：互联网　发布：扬州软件测试培训　编辑：程序博客网　时间：2024/05/14 02:51

我在实现抓取关键字搜索结果的时候，遇到的验证码识别问题确实很蛋疼。一开始本来想破解验证码，后来查了资料发现这个很难，就想改为实现手动填写验证码。

网上提供手动填写验证码的基本实现思路是：

1. 请求获取图片。

2. 保存图片至本地。

3. 打开图片。

4. 手动填写验证码，模拟请求验证码识别验证。

后来看了别人的blog发现一个更简单暴力的方法，就是开浏览器填写验证码。

// Open the search URL in the system default browser so the CAPTCHA can be typed in by hand.
// Support is checked BEFORE calling Desktop.getDesktop(): on headless or unsupported
// platforms getDesktop() itself throws, so obtaining the instance first would bypass the guard.
Desktop desktop = Desktop.isDesktopSupported() ? Desktop.getDesktop() : null;
if (desktop != null && desktop.isSupported(Desktop.Action.BROWSE)) {
    // searchUrl is defined by the surrounding code (not shown in this snippet).
    URI uri = new URI(searchUrl);
    desktop.browse(uri);
}

这个方法虽然解决了验证码识别问题，但是需要人工反复填写验证码；抓取效率不高；只适合临时数据采集。

既然这个方法效率不高，那有没有其他实现办法呢？答案是有的，我们可以尝试抓取手机端新浪微博内容。

但是抓取手机端新浪微博的时候要注意：抓取过于频繁，IP 可能会被封，一定要注意抓取时间间隔。代码如下：（其中的工具类代码就不贴了）

public void keyWordsSearch(String keyword){
try {
String searchHomeUrl = "http://weibo.cn/search/?tf=5_012&vt=4";
HtmlPage searchHomePage = client.getPage(searchHomeUrl);

//搜索表单
HtmlForm searchForm = searchHomePage.getForms().get(0);
HtmlTextInput keywordField = searchForm.getInputByName("keyword");
HtmlSubmitInput searchBlogField = searchForm.getInputByName("smblog");

//模拟设值搜索框操作
keywordField.click();
keywordField.setValueAttribute(keyword);
keywordField.blur();
//模拟设值搜索微博按钮操作
HtmlPage nowPage = searchBlogField.click();
// System.out.println(nowPage.asText());
HtmlPage nextPage = null;
HtmlAnchor nextAnchor = null;
for(int i=0; i<100; i++){
// System.out.println("第"+(i+1)+"页");
// System.out.println(nowPage.asText());
String responseContent = nowPage.getWebResponse().getContentAsString();
HtmlCleaner cleaner = new HtmlCleaner();
TagNode rootNode = cleaner.clean(responseContent);

Object[] cDivNodes = rootNode.evaluateXPath("//div[@class='c']");
if(cDivNodes!=null && cDivNodes.length>0){
for(int j=0;j<cDivNodes.length;j++){
TagNode cDivNode = (TagNode)cDivNodes[j];
//微博内容或者是转发理由
String textContent = null;
//微博id
String divID = cDivNode.getAttributeByName("id");
//用户昵称、用户id、用户博客url
String nickName=null;
String userHref=null;
String uid=null;
String repostCount=null;
String commentCount=null;
String source=null;
RepostBean repostBean=null;
if(divID!=null && divID.matches("M_A.*")){//微博内容层
//微博来源
Object[] sourceNodes = cDivNode.evaluateXPath("//span[@class='ct']");
if(sourceNodes!=null && sourceNodes.length>0){
TagNode sourceNode = (TagNode)sourceNodes[0];
source = MyStringUtils.pureString(sourceNode.getText().toString());
}
//用户信息层，提取昵称、用户id、博客url
Object[] nkLinkNodes = cDivNode.evaluateXPath("//a[@class='nk']");
if(nkLinkNodes!=null && nkLinkNodes.length>0){
TagNode nkLinkNode = (TagNode)nkLinkNodes[0];
nickName = MyStringUtils.pureString(nkLinkNode.getText().toString());
userHref = nkLinkNode.getAttributeByName("href");
//提取博客url中用户id信息
uid = MyStringUtils.takeUid4Href(userHref);
}else{
System.out.println("错错错。。。。。");
log.error("缺失用户信息，搜索微博无意义。。。");
}

Object[] divNodes = cDivNode.evaluateXPath("//div");
if(divNodes!=null){
if(divNodes.length==1 || divNodes.length==2){//原创微博,纯文字微博和带图片的微博
//微博文字内容
Object[] textNodes = cDivNode.evaluateXPath("//span[@class='ctt']");
if(textNodes!=null && textNodes.length>0){
TagNode textNode = (TagNode)textNodes[0];
textContent = MyStringUtils.pureString(textNode.getText().toString());
}
//微博转发次数、评论次数
TagNode statuNode = (TagNode)divNodes[divNodes.length-1];//最后一层
Object[] linkNodes = statuNode.evaluateXPath("//a");
if(linkNodes!=null && linkNodes.length>0){
for(int k=0;k<linkNodes.length;k++){
TagNode linkNode = (TagNode)linkNodes[k];
String linkText = MyStringUtils.pureString(linkNode.getText().toString());
if(linkText!=null){
if(linkText.contains("转发")){
repostCount = MyStringUtils.takeDigit4Str(linkText);
}else if(linkText.contains("评论")){
commentCount = MyStringUtils.takeDigit4Str(linkText);
}
}
}
}
}else if(divNodes.length==3){//转发微博
repostBean = new RepostBean();
//第一层，提取转发微博的原作者uid、昵称
TagNode firstDivNode = (TagNode)divNodes[0];
Object[] aLinkNodes = firstDivNode.evaluateXPath("//span[@class='cmt']/a");
if(aLinkNodes!=null&&aLinkNodes.length>0){
TagNode aLinkNode = (TagNode)aLinkNodes[0];
String originalNickName = MyStringUtils.pureString(aLinkNode.getText().toString());
//设置转发微博原创昵称
repostBean.setNickName(originalNickName);
String originalUserHref = aLinkNode.getAttributeByName("href");
//设置转发微博原创uid
repostBean.setUid(MyStringUtils.takeUid4Href(originalUserHref));

}
Object[] originalContentNodes = firstDivNode.evaluateXPath("//span[@class='ctt']");
if(originalContentNodes!=null && originalContentNodes.length>0){
TagNode originalContentNode = (TagNode)originalContentNodes[0];
String originalContent = MyStringUtils.pureString(originalContentNode.getText().toString());
//设置转发微博原创内容
repostBean.setContent(originalContent);
}
//第二层，提取转发微博的转发数、评论数
TagNode secondDivNode = (TagNode)divNodes[1];
Object[] cmtNodes = secondDivNode.evaluateXPath("//span[@class='cmt']");
if(cmtNodes!=null && cmtNodes.length>0){
TagNode oriRepostCountNode = (TagNode)cmtNodes[cmtNodes.length-1];
String oriRepostCountText = MyStringUtils.pureString(oriRepostCountNode.getText().toString());
if(oriRepostCountText!=null && oriRepostCountText.contains("转发")){
//设置转发微博原转发数
repostBean.setRepostCount(MyStringUtils.takeDigit4Str(oriRepostCountText));
}

}
Object[] ccNodes = secondDivNode.evaluateXPath("//a[@class='cc']");
if(ccNodes!=null && ccNodes.length>0){
TagNode oriCommentCountNode = (TagNode)ccNodes[0];
String oriCommentCountText = MyStringUtils.pureString(oriCommentCountNode.getText().toString());
if(oriCommentCountText!=null && oriCommentCountText.contains("评论")){
//设置转发微博原转发数
repostBean.setCommentCount(MyStringUtils.takeDigit4Str(oriCommentCountText));
}

}
//第三层，提取微博的转发理由（微博内容）、转发数、评论数
TagNode threeDivNode = (TagNode)divNodes[2];
Object[] aCountLinkNodes = threeDivNode.evaluateXPath("//a");
if(aCountLinkNodes!=null && aCountLinkNodes.length>0){
for(int k=0;k<aCountLinkNodes.length;k++){
TagNode aCountLinkNode = (TagNode)aCountLinkNodes[k];
String linkText = MyStringUtils.pureString(aCountLinkNode.getText().toString());
if(linkText!=null && linkText.contains("转发")){
repostCount = MyStringUtils.takeDigit4Str(linkText);
}
if(linkText!=null && linkText.contains("评论")){
commentCount = MyStringUtils.takeDigit4Str(linkText);
}
}
}
textContent = MyStringUtils.pureString(threeDivNode.getText().toString());
//去杂质，将转发理由后面的赞、转发数、评论数去掉
if(textContent!=null && !"".equals(textContent)){
int cutEndIdx = textContent.lastIndexOf("赞");
if(cutEndIdx>0){
textContent = textContent.substring(0, cutEndIdx);
}
}

}else{
log.error("既不是原创也不是转发。。。。div层数："+divNodes.length);
System.out.println("既不是原创也不是转发。。。。div层数："+divNodes.length);
}
}

if(repostBean!=null){
log.info(keyword+"€"+nickName+"€"+uid+"€"+userHref+"€"+divID+"€"+source+"€"+repostCount+"€"+commentCount
+"€"+repostBean.getNickName()+"€"+repostBean.getUid()+"€"+repostBean.getRepostCount()+"€"
+repostBean.getCommentCount()+"€"+repostBean.getContent()+"€"+textContent);
}else{
log.info(keyword+"€"+nickName+"€"+uid+"€"+userHref+"€"+divID+"€"+source+"€"+repostCount+"€"+commentCount+"€"+textContent);
}

}
}
}else{
System.out.println("搜索结果页面无微博内容。。。。");
log.error("搜索结果页面无微博内容。。。。");
}
System.out.println("提取第【"+(i+1)+"】页。。。。");

nextAnchor = nowPage.getAnchorByText("下页");
nextPage = nextAnchor.click();
nowPage = nextPage;
try {
Thread.sleep(5000);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}

0 0