Java 小爬虫：抓取智联招聘职位并保存到 Excel

来源：互联网 | 发布：生命世界网络构图 | 编辑：程序博客网 | 时间：2024/04/29 23:36
import java.io.BufferedReader;import java.io.File;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.FileWriter;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.PrintWriter;import java.net.URL;import java.net.URLConnection;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import org.apache.poi.hssf.usermodel.HSSFWorkbook;import org.apache.poi.ss.usermodel.Cell;import org.apache.poi.ss.usermodel.CreationHelper;import org.apache.poi.ss.usermodel.Row;import org.apache.poi.ss.usermodel.Sheet;import org.apache.poi.ss.usermodel.Workbook;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import org.junit.Test;public class Jll2 {/** *  *<p>Title:智联招聘网站爬虫小案例</p> *<p>Description:</p> *<p>Company:</p> *@author wty * @throws IOException  *@date 2017年7月8日上午11:28:01 * */    public static String getHtmlCode(String url, String encoding) throws IOException {            URL uri =null;        URLConnection urlConnection =null;        InputStream inputStream =null;        InputStreamReader inputStreamReader = null;        BufferedReader bReader =null;        StringBuffer sBuffer= new StringBuffer();        try {            // 建立网络连接             uri = new URL(url);            // 打开连接             urlConnection = uri.openConnection();            //输入流             inputStream = urlConnection.getInputStream();             inputStreamReader = new InputStreamReader(inputStream, encoding);             bReader = new BufferedReader(inputStreamReader);             String temp;             while ((temp = bReader.readLine()) != null) {                sBuffer.append(temp + "\n");             }        } catch (Exception e) {            e.printStackTrace();        }finally {            //关闭资源            if(bReader!=null){                try {                    bReader.close();                } catch (IOException e) {         
           e.printStackTrace();                }            }        }        return sBuffer.toString();}    public static List<HashMap<String, String>> analyzeHtml(String url, String encoding) throws IOException{        String htmlCode = getHtmlCode(url, encoding);        Document document = Jsoup.parse(htmlCode);        Elements elements = document.getElementsByClass("newlist");        List<HashMap<String, String>> list=new ArrayList<>();        for (Element e : elements) {            HashMap<String, String> map = new HashMap<>();            String zwmc = e.getElementsByClass("zwmc").text();            String gsmc = e.getElementsByClass("gsmc").text();            String zwyx = e.getElementsByClass("zwyx").text();            String gzdd = e.getElementsByClass("gzdd").text();            String gxsj = e.getElementsByClass("gxsj").text();            map.put("职位名称", zwmc);            map.put("公司名称", gsmc);            map.put("职位月薪", zwyx);            map.put("工作地点", gzdd);            map.put("发布日期", gxsj);            list.add(map);        }        return list;    }    public static void main(String[] args) throws IOException {    Workbook wb = new HSSFWorkbook();FileOutputStream fileOut = new FileOutputStream("E:\\fiveMonth\\poi\\txts2.xls");Sheet sheet = wb.createSheet("java");        List<HashMap<String, String>> resultList = analyzeHtml("http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=java&p=1&isadv=0", "UTF-8");        for (int i = 0; i < resultList.size(); i++) {Row row = sheet.createRow(i);row.createCell(0).setCellValue(resultList.get(i).get("职位名称"));row.createCell(1).setCellValue(resultList.get(i).get("公司名称"));row.createCell(2).setCellValue(resultList.get(i).get("职位月薪"));row.createCell(3).setCellValue(resultList.get(i).get("工作地点"));row.createCell(4).setCellValue(resultList.get(i).get("发布日期"));}        wb.write(fileOut);fileOut.close();    }}
阅读全文
0 0