Jsoup网页爬虫案例

来源:互联网 发布:游戏编程入门自学书籍 编辑:程序博客网 时间:2024/05/18 01:29

       最近我的一个软件要改版,为此做了一个用于演示的 demo,在这里分享给大家,一起学习如何使用 Jsoup 抓取并解析 HTML 页面。


package cn.oschina.net;

import android.app.Activity;
import android.app.ProgressDialog;
import android.content.Context;
import android.content.DialogInterface;
import android.content.Intent;
import android.os.AsyncTask;
import android.os.Bundle;
import android.util.Log;
import android.view.View;
import android.widget.AdapterView;
import android.widget.ListView;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Entry screen: downloads the fotomen.cn front page, extracts the title and
 * link of every article teaser, and shows them in a list. Tapping a row opens
 * {@link OSchinaMain} with the article URL passed as the "url" intent extra.
 */
public class MainActivity extends Activity {
    private static final String TAG = "MainActivity";

    /** Key under which the parsed article list is stored in the application cache. */
    private static final String CACHE_KEY = "doc";

    /** Upper bound of the horizontal progress bar. */
    private static final int PROGRESS_MAX = 100;

    ListView listView;
    TitleAdapter mAdapter;

    /** Called when the activity is first created. */
    @Override
    public void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.login);
        listView = (ListView) findViewById(R.id.lv_main);
        new PageTask(this).execute();
    }

    /**
     * Background task that produces the article list, either from the
     * soft-reference cache or by scraping the front page.
     *
     * <p>Each list entry is a map with the keys "title" and "url".
     */
    private class PageTask extends AsyncTask<String, Integer, List<Map<String, Object>>> {
        ProgressDialog pdialog;
        MyCache cache;

        /**
         * Builds and immediately shows the progress dialog.
         *
         * @param context the hosting activity; its application context is
         *                expected to be a {@code MyCache} instance
         */
        public PageTask(Context context) {
            cache = (MyCache) context.getApplicationContext();
            pdialog = new ProgressDialog(context, 0);
            pdialog.setTitle("conneting....");
            pdialog.setButton("cancel", new DialogInterface.OnClickListener() {
                public void onClick(DialogInterface dialog, int i) {
                    dialog.cancel();
                }
            });
            pdialog.setOnCancelListener(new DialogInterface.OnCancelListener() {
                public void onCancel(DialogInterface dialog) {
                    // Stop the download as well; otherwise the task keeps running
                    // and later touches an activity that is already finished.
                    PageTask.this.cancel(true);
                    finish();
                }
            });
            pdialog.setMax(PROGRESS_MAX);
            pdialog.setProgressStyle(ProgressDialog.STYLE_HORIZONTAL);
            pdialog.show();
        }

        /**
         * Returns the cached article list when it is still reachable, otherwise
         * downloads and parses the front page and caches the result.
         *
         * @param params unused
         * @return the article list; may be {@code null} or incomplete when the
         *         download or parsing failed (the failure is logged)
         */
        @Override
        protected List<Map<String, Object>> doInBackground(String... params) {
            List<Map<String, Object>> arr = null;
            try {
                SoftReference<List<Map<String, Object>>> soft = cache.getCacheDoc().get(CACHE_KEY);
                // A soft reference may have been cleared by the GC even though the
                // map entry still exists, so the referent itself must be checked.
                List<Map<String, Object>> cached = (soft != null) ? soft.get() : null;
                if (cached != null) {
                    arr = cached;
                    for (int i = 1; i <= PROGRESS_MAX; i++) {
                        publishProgress(i);
                    }
                    Log.v("OSchina-", "read cache");
                } else {
                    Log.v("OSchina-", "not read cache");
                    arr = new ArrayList<Map<String, Object>>();
                    Document doc = Jsoup.connect("http://fotomen.cn/")
                            .timeout(30000)
                            .post();
                    Log.v("OSchina-", "request over:" + (doc != null));
                    Elements teasers = doc.select("div.cb-article-meta");
                    for (int i = 0; i < teasers.size(); i++) {
                        Element teaser = teasers.get(i);
                        String title = teaser.select("h2").text();
                        String linkHref = teaser.getElementsByTag("a").attr("href");
                        Log.d(TAG, "title=====" + title);
                        Log.d(TAG, "linkHref=====" + linkHref);
                        HashMap<String, Object> entry = new HashMap<String, Object>();
                        entry.put("title", title);   // headline text
                        entry.put("url", linkHref);  // article URL
                        arr.add(entry);
                    }
                    cache.getCacheDoc().put(CACHE_KEY,
                            new SoftReference<List<Map<String, Object>>>(arr));
                }
            } catch (Exception e) {
                Log.e("doInBackground", "--" + e);
            }
            return arr;
        }

        /** Makes sure the dialog is gone when the task was cancelled. */
        @Override
        protected void onCancelled() {
            super.onCancelled();
            if (pdialog.isShowing()) {
                pdialog.dismiss();
            }
        }

        /**
         * Closes the progress dialog and binds the article list to the list view.
         *
         * @param result the article list, or {@code null} when loading failed
         */
        @Override
        protected void onPostExecute(final List<Map<String, Object>> result) {
            if (pdialog.isShowing()) {
                pdialog.dismiss();
            }
            if (result == null) {
                Log.d(TAG, "result====null");
                return;
            }
            mAdapter = new TitleAdapter(MainActivity.this, result);
            listView.setAdapter(mAdapter);
            listView.setOnItemClickListener(new AdapterView.OnItemClickListener() {
                @Override
                public void onItemClick(AdapterView<?> adapterView, View view, int i, long l) {
                    Intent intent = new Intent(MainActivity.this, OSchinaMain.class);
                    intent.putExtra("url", result.get(i).get("url").toString());
                    startActivity(intent);
                }
            });
        }

        @Override
        protected void onPreExecute() {
            // Nothing to do: the progress dialog is already shown by the constructor.
        }

        /** Mirrors the published progress value onto the dialog's progress bar. */
        @Override
        protected void onProgressUpdate(Integer... values) {
            pdialog.setProgress(values[0]);
        }
    }
}

package cn.oschina.net;

import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import android.content.Intent;
import android.widget.LinearLayout;
import android.widget.TextView;
import android.app.Activity;
import android.app.ProgressDialog;
import android.content.Context;
import android.content.DialogInterface;
import android.os.AsyncTask;
import android.os.Bundle;
import android.util.Log;
import android.widget.Toast;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Article screen: downloads the page whose address arrives in the "url" intent
 * extra, extracts title, author, publication time and body text, and appends
 * them to a text view.
 */
public class OSchinaMain extends Activity {
    private static final String TAG = "OSchinaMain";
    private TextView textView;
    private LinearLayout ll_all;
    Map<String, SoftReference<List<Map<String, Object>>>> cacheDoc;

    @Override
    public void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.main);
        ll_all = (LinearLayout) findViewById(R.id.ll_all);
        textView = (TextView) findViewById(R.id.tv_readContent);
        Intent intent = getIntent();
        String url = intent.getStringExtra("url");
        // execute() rather than execute(null): the latter passes a null varargs array.
        new PageTask(this, url).execute();
    }

    /**
     * Returns the text of the first matched element, or an empty string when
     * nothing matched, so one missing field does not abort the whole page.
     */
    private static String firstText(Elements matches) {
        return matches.isEmpty() ? "" : matches.get(0).text();
    }

    /**
     * Background task that downloads and parses a single article.
     *
     * <p>The result is a one-element list whose map carries the keys "title",
     * "author", "contentTime" and "article".
     */
    private class PageTask extends AsyncTask<String, Integer, List<Map<String, Object>>> {
        ProgressDialog pdialog;
        MyCache cache;
        String url;

        /**
         * Builds and immediately shows the progress dialog.
         *
         * @param context the hosting activity; its application context is
         *                expected to be a {@code MyCache} instance
         * @param url     address of the article page to download
         */
        public PageTask(Context context, String url) {
            cache = (MyCache) context.getApplicationContext();
            this.url = url;
            Log.v("OSchina-", "url==" + url);
            pdialog = new ProgressDialog(context, 0);
            pdialog.setTitle("正在连接请稍候....");
            pdialog.setButton("cancel", new DialogInterface.OnClickListener() {
                public void onClick(DialogInterface dialog, int i) {
                    dialog.cancel();
                }
            });
            pdialog.setOnCancelListener(new DialogInterface.OnCancelListener() {
                public void onCancel(DialogInterface dialog) {
                    // Stop the download too, so no callback runs on a finished activity.
                    PageTask.this.cancel(true);
                    finish();
                }
            });
            pdialog.setMax(100);
            pdialog.setProgressStyle(ProgressDialog.STYLE_HORIZONTAL);
            pdialog.show();
        }

        /**
         * Downloads the article page and extracts its fields.
         *
         * @param params unused
         * @return a one-element list describing the article, or {@code null}
         *         when the download or parsing failed (the failure is logged)
         */
        @Override
        protected List<Map<String, Object>> doInBackground(String... params) {
            try {
                Log.v("OSchina-", "不走缓存");
                Document doc = Jsoup.connect(url).timeout(8000).post();
                Log.v("OSchina-", "请求结束:" + (doc != null));

                Elements header = doc.select("span.cb-title-fi");
                String contentTime = firstText(header.select("time.updated")); // publication time
                String author = firstText(doc.select("span.fn"));              // author
                String title = firstText(header.select("h1"));                 // headline
                // Plain text of every <article> element. The original chained
                // attr("section", "articleBody"), which sets an attribute rather
                // than filtering, so it had no effect on the extracted text.
                String article = doc.select("article").text();

                Log.d("OSchina-", "title:" + title);
                Log.d("OSchina-", "author:" + author);
                Log.d("OSchina-", "contentTime:" + contentTime);
                Log.d("OSchina-", "article:" + article);

                HashMap<String, Object> fields = new HashMap<String, Object>();
                fields.put("title", title);
                fields.put("author", author);
                fields.put("contentTime", contentTime);
                fields.put("article", article);

                List<Map<String, Object>> arr = new ArrayList<Map<String, Object>>();
                arr.add(fields);
                // Keyed by URL: the "doc" key holds MainActivity's article list and
                // must not be overwritten with data of a different shape.
                cache.getCacheDoc().put(url,
                        new SoftReference<List<Map<String, Object>>>(arr));
                return arr;
            } catch (Exception e) {
                Log.e("doInBackground", "--" + e);
                // UI work (Toast, finish) is not allowed on this worker thread;
                // report the failure via null and let onPostExecute handle it.
                return null;
            }
        }

        /** Makes sure the dialog is gone when the task was cancelled. */
        @Override
        protected void onCancelled() {
            super.onCancelled();
            if (pdialog.isShowing()) {
                pdialog.dismiss();
            }
        }

        /**
         * Closes the progress dialog and renders the article, or shows an error
         * toast and closes the screen when loading failed.
         *
         * @param result the parsed article, or {@code null} on failure
         */
        @Override
        protected void onPostExecute(List<Map<String, Object>> result) {
            if (pdialog.isShowing()) {
                pdialog.dismiss();
            }
            if (result == null) {
                Toast.makeText(OSchinaMain.this, "连接超时", Toast.LENGTH_LONG).show();
                finish();
                return;
            }
            for (Map<String, Object> item : result) {
                textView.append(item.get("title") + "\n");
                textView.append(item.get("author") + "\n");
                textView.append(item.get("contentTime") + "\n");
                textView.append((String) item.get("article"));
            }
            if (!result.isEmpty()) {
                // Leave the text view as the only child of the container.
                ll_all.removeAllViews();
                ll_all.addView(textView);
            }
        }

        @Override
        protected void onPreExecute() {
            // Nothing to do: the progress dialog is already shown by the constructor.
        }

        /** Mirrors the published progress value onto the dialog's progress bar. */
        @Override
        protected void onProgressUpdate(Integer... values) {
            pdialog.setProgress(values[0]);
        }
    }
}

本 demo 以 fotomen.cn 为例:从首页取出最新的几篇文章展示在列表中,点击列表项即可查看该文章的具体内容。




      


此demo的下载地址是:http://download.csdn.net/detail/ligl0702/7001333



0 0
原创粉丝点击