java编写网页爬虫(分页——插入数据库——导出数据)

来源:互联网 发布:java中decode函数 编辑:程序博客网 时间:2024/06/06 14:27
最近由于公司业务需要,需要从某网站爬取数据,在正则表达式上费了一番功夫,其他地方还算OK,这篇文章从网站爬取数据开始(分页的),到用jdbc插入MySQL数据库,然后从MySQL数据库中导出数据,一条龙!!!话不多说,直接上代码:

package com;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class CrawlInfoUtil {
   
    publicstatic void main(String[] args) throws Exception{
      //1、趴取网页上信息,拼装List
       for(int i =1; i < 50; i++){
          StringBuffersb = newStringBuffer("http://www.chinaplasonline.com/ExhibitorList16/lang-simp/page-");
         sb.append(i).append("/src-12/s-cps11/od-0/hall-/asc-1/ChemicalZone.aspx");
         System.out.println("-----------第"+i+"页开始-----------");
         System.out.println("第"+i+"页URL:"+sb.toString());
          Stringcontent = httpRequest(sb.toString());
          List list =htmlFilter(content);
         //2、连接数据库
          Connectionconn = connectDataBase();
         //3、将拼装的数据存入数据库中
         insertData2DataBase(list, conn);
       }
    }
   

   
    privatestatic String httpRequest(String requestUrl) {

       StringBufferbuffer = null;
      BufferedReader bufferedReader = null;
      InputStreamReader inputStreamReader = null;
       InputStreaminputStream = null;
      HttpURLConnection httpUrlConn = null;

       try {
          //建立get请求
          URL url =new URL(requestUrl);
          httpUrlConn= (HttpURLConnection) url.openConnection();
         httpUrlConn.setDoInput(true);
         httpUrlConn.setRequestMethod("GET");

          //获取输入流
          inputStream= httpUrlConn.getInputStream();
         inputStreamReader = new InputStreamReader(inputStream,"utf-8");
         bufferedReader = new BufferedReader(inputStreamReader);

          //从输入流读取结果
          buffer = newStringBuffer();
          String str =null;
          while ((str= bufferedReader.readLine()) != null) {
            buffer.append(str);
          }

       } catch(Exception e) {
         e.printStackTrace();
       } finally{
          //释放资源
          if(bufferedReader != null) {
             try {
               bufferedReader.close();
             } catch(IOException e) {
               e.printStackTrace();
             }
          }
          if(inputStreamReader != null) {
             try {
               inputStreamReader.close();
             } catch(IOException e) {
               e.printStackTrace();
             }
          }
          if(inputStream != null) {
             try {
               inputStream.close();
             } catch(IOException e) {
               e.printStackTrace();
             }
          }
          if(httpUrlConn != null) {
            httpUrlConn.disconnect();
          }
       }
       returnbuffer.toString();
    }

   
    privatestatic List htmlFilter(String html){
       List list =new ArrayList();
      System.out.println("----------------开始解析-----------------");
       Pattern p =Pattern.compile("()(.*?)()");
       Matcher m =p.matcher(html);
       int i =0;
      while(m.find()){
         EnterprisInfoVo vo = new EnterprisInfoVo();
          StringinnerTR = m.group(2);
         //获取展商名称和URL
         //链接到新页面的
          PatterntdPattern = Pattern.compile("(.*)( 
)(.*?)()(.*?)()(.*)");
          MatchertdMatcher = tdPattern.matcher(innerTR);
          //弹出层的
          PatterntdPatternPop = Pattern.compile("(.*)()(.*)");
             Matcher m1 =p1.matcher(content);
            if(m1.matches()){
               vo.setMobile(getStringNotNullValue(m1.group(5)));
             }
            //email
             Pattern p2 =Pattern.compile("(.*)(电邮)(.*?)()(.*?)()(.*)");
             Matcher m2 =p2.matcher(content);
            if(m2.matches()){
               vo.setEmail(getStringNotNullValue(m2.group(5)));
             }
          //公司URL
                Pattern p3 = Pattern.compile("(.*)(网址)(.*?)(<a href=\")(.*?)(\" target=\"_blank\")(.*)");
                Matcher m3 = p3.matcher(content);
                if(m3.matches()){
                    vo.setUrl(getStringNotNullValue(m3.group(5)));
                }
                //标签
                Pattern p4 = Pattern.compile("(.*)(产品分类)(.*?)(-)(.*?)(</td>)(.*)");
             Matcher m4 =p4.matcher(content);
            if(m4.matches()){
               vo.setTags(getStringNotNullValue(m4.group(5)));
             }
            list.add(vo);
          }
          i++;
         System.out.println("解析第"+i+"条!!!");
       }
      System.out.println("------------------解析结束-------------------");
       returnlist;
    }

    privatestatic void insertData2DataBase(Listlist, Connection conn)throwsException{
      System.out.println("待插入的数据条数:" + list.size());
       String sql ="insert into test_enterprise_info(enterprise_name,url,email,mobile,tags) values (?,?,?,?,?)";
      PreparedStatement stmt = conn.prepareStatement(sql);
      for(EnterprisInfoVo vo:list){
         stmt.setString(1, vo.getName());
         stmt.setString(2, vo.getUrl());
         stmt.setString(3, vo.getEmail());
         stmt.setString(4, vo.getMobile());
         stmt.setString(5, vo.getTags());
         stmt.addBatch();
       }
      stmt.executeBatch();
      System.out.println("数据插入完毕!!!");
    }
   
    publicstatic Connection connectDataBase() throws SQLException{
       Connectionconn = null;
       String url = "jdbc:mysql://localhost:3306/smartpr?"
               +"user=root&password=root&useUnicode=true&characterEncoding=UTF8";
       try {
           Class.forName("com.mysql.jdbc.Driver");// 动态加载mysql驱动
           System.out.println("成功加载MySQL驱动程序");
           conn = DriverManager.getConnection(url);
       } catch (SQLException e) {
           System.out.println("MySQL操作错误");
           e.printStackTrace();
       } catch (Exception e) {
           e.printStackTrace();
       }
       return conn;
    }
   
    publicstatic String getStringNotNullValue(Object object) {
       if (object== null) {
          return"";
       } else{
          returnobject.toString().trim();
       }
    }
}
以上是把网页上的数据插入到数据库的过程,这里需要注意一下java的正则表达式,要注意find()方法和matches方法的区别,find()方法是会自动将符合条件的目标移动到下一个,并且是按照部分匹配的,注意我的代码中matches的正则开始和结尾都用到了(.*),而find()方法没有用;matches()方法刚好相反;

下面把数据库中的数据导入到桌面上:
package com;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFFont;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.util.Region;

import com.mysql.jdbc.PreparedStatement;

public class ExcelTest {
   
   
    publicstatic void main(String[] args) throws Exception{
      //1、连接数据库
       Connectionconn = connectDataBase();
      //2、从数据库中取数据
       Listlist =getDataFromDataBase(conn);
      //3、把数据插入excel表中
      insertData2Excel(list);
    }
   
    privatestatic ListgetDataFromDataBase(Connection conn) throwsException{
       String sql ="select * from test_enterprise_info";
      PreparedStatement stmt = (PreparedStatement)conn.prepareStatement(sql);
       ResultSet rs= stmt.executeQuery();
       Listlist =new ArrayList();
      while(rs.next()){
         EnterprisInfoVo vo = new EnterprisInfoVo();
          String name= rs.getString(2);
          String url =rs.getString(3);
          String email= rs.getString(4);
          Stringmobile = rs.getString(5);
          String tags= rs.getString(6);
         vo.setEmail(email);
         vo.setMobile(mobile);
         vo.setName(name);
         vo.setTags(getStringNotNullValue(tags).replace("
", ""));
         vo.setUrl(url);
         list.add(vo);
       }
       returnlist;
    }
   
   @SuppressWarnings("deprecation")
    publicstatic void insertData2Excel(Listlist){
       //声明一个工作薄
       HSSFWorkbook wb = new HSSFWorkbook();
       //声明一个单子并命名
       HSSFSheet sheet = wb.createSheet("企业信息表");
       //给单子名称一个长度
       sheet.setDefaultColumnWidth((short)15);
       // 生成一个样式 
       HSSFCellStyle style = wb.createCellStyle();
       //创建第一行(也可以称为表头)
       sheet.addMergedRegion(new Region(0, (short)0, 0, (short)4));
       HSSFRow row0 = sheet.createRow(0);
       HSSFCell cell0 = row0.createCell(0);
       row0.setHeight((short)500);
       HSSFCellStyle style0 = wb.createCellStyle();
       row0.setRowStyle(style0);
       //设置字体
       HSSFFont font = wb.createFont();
       style0.setFont(font);
       font.setFontHeightInPoints((short)14);
       font.setColor(HSSFFont.BOLDWEIGHT_BOLD);
       //单元格内容
       cell0.setCellValue("企业信息表");
       HSSFRow row = sheet.createRow(1);
       //样式字体居中
       style.setAlignment(HSSFCellStyle.ALIGN_CENTER);
       //给表头第一行一次创建单元格
       HSSFCell cell = row.createCell((short) 0);
       cell.setCellValue("企业名称");
       cell.setCellStyle(style);
       cell = row.createCell( (short) 1); 
       cell.setCellValue("企业网站URL"); 
       cell.setCellStyle(style); 
       cell = row.createCell((short) 2); 
       cell.setCellValue("企业email"); 
       cell.setCellStyle(style);
       cell = row.createCell((short) 3); 
       cell.setCellValue("企业联系电话"); 
       cell.setCellStyle(style);
       cell = row.createCell((short) 4); 
       cell.setCellValue("企业标签"); 
       cell.setCellStyle(style);
      //向单元格里填充数据
      for (int i = 0; i < list.size(); i++) {
           row = sheet.createRow(i + 2);
           EnterprisInfoVo vo = list.get(i);
           row.createCell(0).setCellValue(vo.getName());
           row.createCell(1).setCellValue(vo.getUrl());
           row.createCell(2).setCellValue(vo.getEmail());
           row.createCell(3).setCellValue(vo.getMobile());
           row.createCell(4).setCellValue(vo.getTags());
       }
        
      try {
           //默认导出到E盘下
           FileOutputStream out = newFileOutputStream("C://Users//Administrator//Desktop/EnterpriseInfo.xls");
           wb.write(out);
           out.close();
       } catch (FileNotFoundException e) {
           e.printStackTrace();
       } catch (IOException e) {
           e.printStackTrace();
       }
    }
   
   
    publicstatic Connection connectDataBase() throws SQLException{
       Connectionconn = null;
       String url = "jdbc:mysql://localhost:3306/smartpr?"
               +"user=root&password=root&useUnicode=true&characterEncoding=UTF8";
       try {
           Class.forName("com.mysql.jdbc.Driver");// 动态加载mysql驱动
           System.out.println("成功加载MySQL驱动程序");
           conn = DriverManager.getConnection(url);
       } catch (SQLException e) {
           System.out.println("MySQL操作错误");
           e.printStackTrace();
       } catch (Exception e) {
           e.printStackTrace();
       }
       return conn;
    }
   
    publicstatic String getStringNotNullValue(Object object) {
       if (object== null) {
          return"";
       } else{
          returnobject.toString().trim();
       }
    }
}








 
1 0
原创粉丝点击