Jsoup网页爬虫案例
来源:互联网 发布:游戏编程入门自学书籍 编辑:程序博客网 时间:2024/05/18 01:29
最近我的一个软件要改版,为此做了一个用于演示的demo,在这里分享给大家,一起学习如何用Jsoup抓取并解析HTML页面。
package cn.oschina.net;

import android.app.Activity;
import android.app.ProgressDialog;
import android.content.Context;
import android.content.DialogInterface;
import android.content.Intent;
import android.os.AsyncTask;
import android.os.Bundle;
import android.util.Log;
import android.view.View;
import android.widget.AdapterView;
import android.widget.ListView;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Start screen: downloads the front page, extracts the article titles and
 * links from it with jsoup, and shows them in a list. Tapping a row opens
 * {@link OSchinaMain} with the article URL passed as the "url" extra.
 */
public class MainActivity extends Activity {
    private static final String TAG = "MainActivity";

    /** Key of the article list inside the application-level cache map. */
    private static final String CACHE_KEY = "doc";
    /** Page whose article teasers are listed. */
    private static final String HOME_URL = "http://fotomen.cn/";
    /** Connect/read timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MS = 30000;
    /** Upper bound of the horizontal progress bar. */
    private static final int PROGRESS_MAX = 100;

    ListView listView;
    TitleAdapter mAdapter;

    /** Called when the activity is first created. */
    @Override
    public void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.login);
        listView = (ListView) findViewById(R.id.lv_main);
        new PageTask(this).execute();
    }

    /**
     * Loads the article list off the UI thread, preferring the in-memory cache
     * over the network, and binds the result to the list view when done.
     */
    private class PageTask extends AsyncTask<String, Integer, List<Map<String, Object>>> {
        ProgressDialog pdialog;
        MyCache cache;

        /**
         * Creates the task and immediately shows its progress dialog.
         *
         * @param context context used for the dialog; its application context
         *                is cast to {@code MyCache}
         */
        public PageTask(Context context) {
            cache = (MyCache) context.getApplicationContext();
            pdialog = new ProgressDialog(context, 0);
            pdialog.setTitle("conneting....");
            pdialog.setButton(DialogInterface.BUTTON_NEGATIVE, "cancel",
                    new DialogInterface.OnClickListener() {
                        public void onClick(DialogInterface dialog, int i) {
                            dialog.cancel();
                        }
                    });
            pdialog.setOnCancelListener(new DialogInterface.OnCancelListener() {
                public void onCancel(DialogInterface dialog) {
                    // Stop the background work as well; otherwise the task keeps
                    // running and later updates an activity that is already gone.
                    PageTask.this.cancel(true);
                    MainActivity.this.finish();
                }
            });
            pdialog.setMax(PROGRESS_MAX);
            pdialog.setProgressStyle(ProgressDialog.STYLE_HORIZONTAL);
            pdialog.show();
        }

        /**
         * Returns the cached article list, or downloads and parses the front
         * page when nothing usable is cached.
         *
         * @param params unused
         * @return one map per article with the keys "title" and "url"; the
         *         entries parsed so far (possibly none) if the download or
         *         parsing fails, or {@code null} if the failure happens before
         *         a list exists
         */
        @Override
        protected List<Map<String, Object>> doInBackground(String... params) {
            List<Map<String, Object>> arr = null;
            try {
                arr = readCachedList();
                if (arr != null) {
                    // Nothing to wait for; just run the bar up to 100%.
                    for (int i = 1; i <= PROGRESS_MAX; i++) {
                        publishProgress((int) ((i / (float) PROGRESS_MAX) * 100));
                    }
                    Log.v("OSchina-", "read cache");
                } else {
                    Log.v("OSchina-", "not read cache");
                    arr = new ArrayList<Map<String, Object>>();
                    // Plain page retrieval, so GET rather than POST.
                    Document doc = Jsoup.connect(HOME_URL).timeout(TIMEOUT_MS).get();
                    Log.v("OSchina-", "request over:" + (doc != null));
                    Elements teasers = doc.select("div.cb-article-meta");
                    for (int i = 0; i < teasers.size(); i++) {
                        String title = teasers.get(i).select("h2").text();
                        String linkHref = teasers.get(i).getElementsByTag("a").attr("href");
                        Log.d(TAG, "title=====" + title);
                        Log.d(TAG, "linkHref=====" + linkHref);
                        HashMap<String, Object> entry = new HashMap<String, Object>();
                        entry.put("title", title);   // headline
                        entry.put("url", linkHref);  // article URL
                        arr.add(entry);
                    }
                    cache.getCacheDoc().put(CACHE_KEY,
                            new SoftReference<List<Map<String, Object>>>(arr));
                }
            } catch (Exception e) {
                Log.e("doInBackground", "--" + e, e);
            }
            return arr;
        }

        /**
         * Looks up the article list in the cache.
         *
         * @return the cached list, or {@code null} when there is no entry or
         *         its soft reference has already been cleared
         */
        private List<Map<String, Object>> readCachedList() {
            SoftReference<List<Map<String, Object>>> soft = cache.getCacheDoc().get(CACHE_KEY);
            return soft == null ? null : soft.get();
        }

        @Override
        protected void onCancelled() {
            super.onCancelled();
        }

        /**
         * Closes the progress dialog and, when a list was produced, shows it
         * and wires row clicks to the article screen.
         *
         * @param result value returned by {@link #doInBackground}; may be null
         */
        @Override
        protected void onPostExecute(final List<Map<String, Object>> result) {
            if (pdialog.isShowing()) {
                pdialog.dismiss();
            }
            if (MainActivity.this.isFinishing()) {
                return; // the screen is going away; nothing left to update
            }
            if (result == null) {
                Log.d(TAG, "result====null");
                return;
            }
            mAdapter = new TitleAdapter(MainActivity.this, result);
            listView.setAdapter(mAdapter);
            listView.setOnItemClickListener(new AdapterView.OnItemClickListener() {
                @Override
                public void onItemClick(AdapterView<?> adapterView, View view, int i, long l) {
                    Intent intent = new Intent(MainActivity.this, OSchinaMain.class);
                    intent.putExtra("url", result.get(i).get("url").toString());
                    startActivity(intent);
                }
            });
        }

        @Override
        protected void onPreExecute() {
            // The progress dialog is already shown by the constructor.
        }

        /** Mirrors the published percentage onto the progress dialog. */
        @Override
        protected void onProgressUpdate(Integer... values) {
            pdialog.setProgress(values[0]);
        }
    }
}
package cn.oschina.net;

import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import android.content.Intent;
import android.widget.LinearLayout;
import android.widget.TextView;
import android.app.Activity;
import android.app.ProgressDialog;
import android.content.Context;
import android.content.DialogInterface;
import android.os.AsyncTask;
import android.os.Bundle;
import android.util.Log;
import android.widget.Toast;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Article screen: downloads the page whose address arrives in the "url"
 * intent extra, extracts title, author, time and body text with jsoup, and
 * appends them to a text view.
 */
public class OSchinaMain extends Activity {
    private static final String TAG = "OSchinaMain";

    /**
     * Prefix for article entries in the shared cache map. Articles must not be
     * stored under "doc": that key holds the article list MainActivity reads.
     */
    private static final String ARTICLE_CACHE_PREFIX = "article:";
    /** Connect/read timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MS = 8000;

    private TextView textView;
    private LinearLayout ll_all;
    Map<String, SoftReference<List<Map<String, Object>>>> cacheDoc;

    @Override
    public void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.main);
        ll_all = (LinearLayout) findViewById(R.id.ll_all);
        textView = (TextView) findViewById(R.id.tv_readContent);
        String url = getIntent().getStringExtra("url");
        // execute() with no arguments: execute(null) is ambiguous between
        // execute(Params...) and the static execute(Runnable).
        new PageTask(this, url).execute();
    }

    /** Downloads and parses one article off the UI thread. */
    private class PageTask extends AsyncTask<String, Integer, List<Map<String, Object>>> {
        ProgressDialog pdialog;
        MyCache cache;
        String url;

        /**
         * Creates the task and immediately shows its progress dialog.
         *
         * @param context context used for the dialog; its application context
         *                is cast to {@code MyCache}
         * @param url     address of the article page to download
         */
        public PageTask(Context context, String url) {
            cache = (MyCache) context.getApplicationContext();
            this.url = url;
            Log.v("OSchina-", "url==" + url);
            pdialog = new ProgressDialog(context, 0);
            pdialog.setTitle("正在连接请稍候....");
            pdialog.setButton(DialogInterface.BUTTON_NEGATIVE, "cancel",
                    new DialogInterface.OnClickListener() {
                        public void onClick(DialogInterface dialog, int i) {
                            dialog.cancel();
                        }
                    });
            pdialog.setOnCancelListener(new DialogInterface.OnCancelListener() {
                public void onCancel(DialogInterface dialog) {
                    // Stop the background work too, so it does not report back
                    // to an activity that has already been closed.
                    PageTask.this.cancel(true);
                    OSchinaMain.this.finish();
                }
            });
            pdialog.setMax(100);
            pdialog.setProgressStyle(ProgressDialog.STYLE_HORIZONTAL);
            pdialog.show();
        }

        /**
         * Downloads the article page and extracts its fields.
         *
         * <p>No UI calls are made here: this runs on a worker thread, so
         * failures are reported by returning {@code null} and handled in
         * {@link #onPostExecute}.
         *
         * @param params unused
         * @return a single-element list whose map has the keys "title",
         *         "author", "contentTime" and "article", or {@code null} when
         *         the download or parsing failed
         */
        @Override
        protected List<Map<String, Object>> doInBackground(String... params) {
            try {
                Log.v("OSchina-", "不走缓存");
                // Plain page retrieval, so GET rather than POST.
                Document doc = Jsoup.connect(url).timeout(TIMEOUT_MS).get();
                Log.v("OSchina-", "请求结束:" + (doc != null));

                // Publication time
                String contentTime =
                        firstText(doc.select("span.cb-title-fi").select("time.updated"));
                // Author
                String author = firstText(doc.select("span.fn"));
                // Headline
                String title = firstText(doc.select("span.cb-title-fi").select("h1"));
                // Body: combined text of every <article> element. The original
                // chained attr("section", "articleBody") here, which sets an
                // attribute instead of filtering and did not affect this text.
                String article = doc.select("article").text();

                Log.d("OSchina-", "title:" + title);
                Log.d("OSchina-", "author:" + author);
                Log.d("OSchina-", "contentTime:" + contentTime);
                Log.d("OSchina-", "article:" + article);

                HashMap<String, Object> fields = new HashMap<String, Object>();
                fields.put("title", title);
                fields.put("author", author);
                fields.put("contentTime", contentTime);
                fields.put("article", article);

                List<Map<String, Object>> arr = new ArrayList<Map<String, Object>>();
                arr.add(fields);
                cache.getCacheDoc().put(ARTICLE_CACHE_PREFIX + url,
                        new SoftReference<List<Map<String, Object>>>(arr));
                return arr;
            } catch (Exception e) {
                Log.e("doInBackground", "--" + e, e);
                return null;
            }
        }

        /**
         * Returns the text of the first selected element.
         *
         * @param matches selection result, possibly empty
         * @return the first element's text, or an empty string when nothing matched
         */
        private String firstText(Elements matches) {
            return matches.isEmpty() ? "" : matches.get(0).text();
        }

        @Override
        protected void onCancelled() {
            super.onCancelled();
        }

        /**
         * Closes the progress dialog, then either renders the article or, when
         * loading failed, shows the error toast and closes the screen.
         *
         * @param result value returned by {@link #doInBackground}; null on failure
         */
        @Override
        protected void onPostExecute(List<Map<String, Object>> result) {
            if (pdialog.isShowing()) {
                pdialog.dismiss();
            }
            if (OSchinaMain.this.isFinishing()) {
                return; // the screen is going away; nothing left to update
            }
            if (result == null) {
                Toast.makeText(OSchinaMain.this, "连接超时", Toast.LENGTH_LONG).show();
                finish();
                return;
            }
            for (int i = 0; i < result.size(); i++) {
                Map<String, Object> fields = result.get(i);
                textView.append((String) fields.get("title") + "\n");
                textView.append((String) fields.get("author") + "\n");
                textView.append((String) fields.get("contentTime") + "\n");
                textView.append((String) fields.get("article"));
            }
            if (!result.isEmpty()) {
                // Leave the text view as the container's only child.
                ll_all.removeAllViews();
                ll_all.addView(textView);
            }
        }

        @Override
        protected void onPreExecute() {
            // The progress dialog is already shown by the constructor.
        }

        /** Mirrors the published percentage onto the progress dialog. */
        @Override
        protected void onProgressUpdate(Integer... values) {
            pdialog.setProgress(values[0]);
        }
    }
}
以fotomen.cn为例,首页取出最新的几篇文章展示在列表,点列表可以展示文章的具体内容。
此demo的下载地址是:http://download.csdn.net/detail/ligl0702/7001333
0 0
- Jsoup网页爬虫案例
- 网页爬虫框架jsoup介绍
- jsoup爬虫爬取网页
- Jsoup网页数据抓取案例
- jsoup爬虫网页数据出现异常
- java 爬虫 网页解析(Jsoup)
- 【爬虫系列】第二部分 网页解析Jsoup
- java爬虫--jsoup简单的表单抓取案例
- jsoup 爬虫
- WebCollector爬虫使用内置的Jsoup进行网页抽取
- Jsoup 实现的基于列表-详情结构的网页爬虫
- Android利用jsoup爬虫爬网页数据(一)
- Android利用jsoup爬虫爬网页数据(二)
- java爬虫(使用jsoup设置代理,抓取网页内容)
- Android Jsoup网页爬虫—>程序猿面试指南App
- 爬虫系列(二)——网页解析Jsoup
- 【Java Utility】Jsoup网页爬虫工具--处理URL链接【十】
- 【Java Utility】Jsoup网页爬虫工具--设置属性值【十一】
- iOS 7.1随时都有可能到来 最迟不过下个星期
- 开启心灵的钥匙
- C语言 struct大小、首地址与内存对齐
- Spring MVC 3.1多视图协商配置(json、xml、freemarker)
- C++ 更改控制台显示文本颜色
- Jsoup网页爬虫案例
- java.lang.IncompatibleClassChangeError
- 浅谈ASP.net处理XML数据
- 标准C++中的string类的用法总结
- tableViewCell 自适应高度
- 使用CXF创建REST WEBSERVICE
- 硬盘格式化了怎么办
- linux忘记root密码的解决方法
- Android导入一个工程时提示 Invalid project description的解决方法