java编写网页爬虫(分页——插入数据库——导出数据)
来源:互联网 发布:java中decode函数 编辑:程序博客网 时间:2024/06/06 14:27
最近由于公司业务需要,需要从某网站爬取数据,在正则表达式上费了一番功夫,其他地方还算OK,这篇文章从网站爬取数据开始(分页的),到用jdbc插入MySQL数据库,然后从MySQL数据库中导出数据,一条龙!!!话不多说,直接上代码:
package com;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class CrawlInfoUtil {
publicstatic void main(String[] args) throws Exception{
//1、趴取网页上信息,拼装List
for(int i =1; i < 50; i++){
StringBuffersb = newStringBuffer("http://www.chinaplasonline.com/ExhibitorList16/lang-simp/page-");
sb.append(i).append("/src-12/s-cps11/od-0/hall-/asc-1/ChemicalZone.aspx");
System.out.println("-----------第"+i+"页开始-----------");
System.out.println("第"+i+"页URL:"+sb.toString());
Stringcontent = httpRequest(sb.toString());
List list =htmlFilter(content);
//2、连接数据库
Connectionconn = connectDataBase();
//3、将拼装的数据存入数据库中
insertData2DataBase(list, conn);
}
}
privatestatic String httpRequest(String requestUrl) {
StringBufferbuffer = null;
BufferedReader bufferedReader = null;
InputStreamReader inputStreamReader = null;
InputStreaminputStream = null;
HttpURLConnection httpUrlConn = null;
try {
//建立get请求
URL url =new URL(requestUrl);
httpUrlConn= (HttpURLConnection) url.openConnection();
httpUrlConn.setDoInput(true);
httpUrlConn.setRequestMethod("GET");
//获取输入流
inputStream= httpUrlConn.getInputStream();
inputStreamReader = new InputStreamReader(inputStream,"utf-8");
bufferedReader = new BufferedReader(inputStreamReader);
//从输入流读取结果
buffer = newStringBuffer();
String str =null;
while ((str= bufferedReader.readLine()) != null) {
buffer.append(str);
}
} catch(Exception e) {
e.printStackTrace();
} finally{
//释放资源
if(bufferedReader != null) {
try {
bufferedReader.close();
} catch(IOException e) {
e.printStackTrace();
}
}
if(inputStreamReader != null) {
try {
inputStreamReader.close();
} catch(IOException e) {
e.printStackTrace();
}
}
if(inputStream != null) {
try {
inputStream.close();
} catch(IOException e) {
e.printStackTrace();
}
}
if(httpUrlConn != null) {
httpUrlConn.disconnect();
}
}
returnbuffer.toString();
}
privatestatic List htmlFilter(String html){
List list =new ArrayList();
System.out.println("----------------开始解析-----------------");
Pattern p =Pattern.compile("()(.*?)()");
Matcher m =p.matcher(html);
int i =0;
while(m.find()){
EnterprisInfoVo vo = new EnterprisInfoVo();
StringinnerTR = m.group(2);
//获取展商名称和URL
//链接到新页面的
PatterntdPattern = Pattern.compile("(.*)(
package com;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class CrawlInfoUtil {
publicstatic void main(String[] args) throws Exception{
//1、趴取网页上信息,拼装List
for(int i =1; i < 50; i++){
StringBuffersb = newStringBuffer("http://www.chinaplasonline.com/ExhibitorList16/lang-simp/page-");
sb.append(i).append("/src-12/s-cps11/od-0/hall-/asc-1/ChemicalZone.aspx");
System.out.println("-----------第"+i+"页开始-----------");
System.out.println("第"+i+"页URL:"+sb.toString());
Stringcontent = httpRequest(sb.toString());
List list =htmlFilter(content);
//2、连接数据库
Connectionconn = connectDataBase();
//3、将拼装的数据存入数据库中
insertData2DataBase(list, conn);
}
}
privatestatic String httpRequest(String requestUrl) {
StringBufferbuffer = null;
BufferedReader bufferedReader = null;
InputStreamReader inputStreamReader = null;
InputStreaminputStream = null;
HttpURLConnection httpUrlConn = null;
try {
//建立get请求
URL url =new URL(requestUrl);
httpUrlConn= (HttpURLConnection) url.openConnection();
httpUrlConn.setDoInput(true);
httpUrlConn.setRequestMethod("GET");
//获取输入流
inputStream= httpUrlConn.getInputStream();
inputStreamReader = new InputStreamReader(inputStream,"utf-8");
bufferedReader = new BufferedReader(inputStreamReader);
//从输入流读取结果
buffer = newStringBuffer();
String str =null;
while ((str= bufferedReader.readLine()) != null) {
buffer.append(str);
}
} catch(Exception e) {
e.printStackTrace();
} finally{
//释放资源
if(bufferedReader != null) {
try {
bufferedReader.close();
} catch(IOException e) {
e.printStackTrace();
}
}
if(inputStreamReader != null) {
try {
inputStreamReader.close();
} catch(IOException e) {
e.printStackTrace();
}
}
if(inputStream != null) {
try {
inputStream.close();
} catch(IOException e) {
e.printStackTrace();
}
}
if(httpUrlConn != null) {
httpUrlConn.disconnect();
}
}
returnbuffer.toString();
}
privatestatic List htmlFilter(String html){
List list =new ArrayList();
System.out.println("----------------开始解析-----------------");
Pattern p =Pattern.compile("()(.*?)()");
Matcher m =p.matcher(html);
int i =0;
while(m.find()){
EnterprisInfoVo vo = new EnterprisInfoVo();
StringinnerTR = m.group(2);
//获取展商名称和URL
//链接到新页面的
PatterntdPattern = Pattern.compile("(.*)(
)(.*?)()(.*?)()(.*)");
MatchertdMatcher = tdPattern.matcher(innerTR);
//弹出层的
PatterntdPatternPop = Pattern.compile("(.*)()(.*)");
Matcher m1 =p1.matcher(content);
if(m1.matches()){
vo.setMobile(getStringNotNullValue(m1.group(5)));
}
//email
Pattern p2 =Pattern.compile("(.*)(电邮)(.*?)()(.*?)()(.*)");
Matcher m2 =p2.matcher(content);
if(m2.matches()){
vo.setEmail(getStringNotNullValue(m2.group(5)));
}
//公司URL
Pattern p3 = Pattern.compile("(.*)(网址)(.*?)(<a href=\")(.*?)(\" target=\"_blank\")(.*)");
Matcher m3 = p3.matcher(content);
if(m3.matches()){
vo.setUrl(getStringNotNullValue(m3.group(5)));
}
//标签
Pattern p4 = Pattern.compile("(.*)(产品分类)(.*?)(-)(.*?)(</td>)(.*)");
Matcher m4 =p4.matcher(content);
if(m4.matches()){
vo.setTags(getStringNotNullValue(m4.group(5)));
}
list.add(vo);
}
i++;
System.out.println("解析第"+i+"条!!!");
}
System.out.println("------------------解析结束-------------------");
returnlist;
}
privatestatic void insertData2DataBase(Listlist, Connection conn)throwsException{
System.out.println("待插入的数据条数:" + list.size());
String sql ="insert into test_enterprise_info(enterprise_name,url,email,mobile,tags) values (?,?,?,?,?)";
PreparedStatement stmt = conn.prepareStatement(sql);
for(EnterprisInfoVo vo:list){
stmt.setString(1, vo.getName());
stmt.setString(2, vo.getUrl());
stmt.setString(3, vo.getEmail());
stmt.setString(4, vo.getMobile());
stmt.setString(5, vo.getTags());
stmt.addBatch();
}
stmt.executeBatch();
System.out.println("数据插入完毕!!!");
}
publicstatic Connection connectDataBase() throws SQLException{
Connectionconn = null;
String url = "jdbc:mysql://localhost:3306/smartpr?"
+"user=root&password=root&useUnicode=true&characterEncoding=UTF8";
try {
Class.forName("com.mysql.jdbc.Driver");// 动态加载mysql驱动
System.out.println("成功加载MySQL驱动程序");
conn = DriverManager.getConnection(url);
} catch (SQLException e) {
System.out.println("MySQL操作错误");
e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
return conn;
}
publicstatic String getStringNotNullValue(Object object) {
if (object== null) {
return"";
} else{
returnobject.toString().trim();
}
}
}
以上是把网页上的数据插入到数据库的过程,这里需要注意一下java的正则表达式,要注意find()方法和matches方法的区别,find()方法是会自动将符合条件的目标移动到下一个,并且是按照部分匹配的,注意我的代码中matches的正则开始和结尾都用到了(.*),而find()方法没有用;matches()方法刚好相反;
下面把数据库中的数据导入到桌面上:
package com;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFFont;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.util.Region;
import com.mysql.jdbc.PreparedStatement;
public class ExcelTest {
publicstatic void main(String[] args) throws Exception{
//1、连接数据库
Connectionconn = connectDataBase();
//2、从数据库中取数据
Listlist =getDataFromDataBase(conn);
//3、把数据插入excel表中
insertData2Excel(list);
}
privatestatic ListgetDataFromDataBase(Connection conn) throwsException{
String sql ="select * from test_enterprise_info";
PreparedStatement stmt = (PreparedStatement)conn.prepareStatement(sql);
ResultSet rs= stmt.executeQuery();
Listlist =new ArrayList();
while(rs.next()){
EnterprisInfoVo vo = new EnterprisInfoVo();
String name= rs.getString(2);
String url =rs.getString(3);
String email= rs.getString(4);
Stringmobile = rs.getString(5);
String tags= rs.getString(6);
vo.setEmail(email);
vo.setMobile(mobile);
vo.setName(name);
vo.setTags(getStringNotNullValue(tags).replace("
", ""));
vo.setUrl(url);
list.add(vo);
}
returnlist;
}
@SuppressWarnings("deprecation")
publicstatic void insertData2Excel(Listlist){
//声明一个工作薄
HSSFWorkbook wb = new HSSFWorkbook();
//声明一个单子并命名
HSSFSheet sheet = wb.createSheet("企业信息表");
//给单子名称一个长度
sheet.setDefaultColumnWidth((short)15);
// 生成一个样式
HSSFCellStyle style = wb.createCellStyle();
//创建第一行(也可以称为表头)
sheet.addMergedRegion(new Region(0, (short)0, 0, (short)4));
HSSFRow row0 = sheet.createRow(0);
HSSFCell cell0 = row0.createCell(0);
row0.setHeight((short)500);
HSSFCellStyle style0 = wb.createCellStyle();
row0.setRowStyle(style0);
//设置字体
HSSFFont font = wb.createFont();
style0.setFont(font);
font.setFontHeightInPoints((short)14);
font.setColor(HSSFFont.BOLDWEIGHT_BOLD);
//单元格内容
cell0.setCellValue("企业信息表");
HSSFRow row = sheet.createRow(1);
//样式字体居中
style.setAlignment(HSSFCellStyle.ALIGN_CENTER);
//给表头第一行一次创建单元格
HSSFCell cell = row.createCell((short) 0);
cell.setCellValue("企业名称");
cell.setCellStyle(style);
cell = row.createCell( (short) 1);
cell.setCellValue("企业网站URL");
cell.setCellStyle(style);
cell = row.createCell((short) 2);
cell.setCellValue("企业email");
cell.setCellStyle(style);
cell = row.createCell((short) 3);
cell.setCellValue("企业联系电话");
cell.setCellStyle(style);
cell = row.createCell((short) 4);
cell.setCellValue("企业标签");
cell.setCellStyle(style);
//向单元格里填充数据
for (int i = 0; i < list.size(); i++) {
row = sheet.createRow(i + 2);
EnterprisInfoVo vo = list.get(i);
row.createCell(0).setCellValue(vo.getName());
row.createCell(1).setCellValue(vo.getUrl());
row.createCell(2).setCellValue(vo.getEmail());
row.createCell(3).setCellValue(vo.getMobile());
row.createCell(4).setCellValue(vo.getTags());
}
try {
//默认导出到E盘下
FileOutputStream out = newFileOutputStream("C://Users//Administrator//Desktop/EnterpriseInfo.xls");
wb.write(out);
out.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
publicstatic Connection connectDataBase() throws SQLException{
Connectionconn = null;
String url = "jdbc:mysql://localhost:3306/smartpr?"
+"user=root&password=root&useUnicode=true&characterEncoding=UTF8";
try {
Class.forName("com.mysql.jdbc.Driver");// 动态加载mysql驱动
System.out.println("成功加载MySQL驱动程序");
conn = DriverManager.getConnection(url);
} catch (SQLException e) {
System.out.println("MySQL操作错误");
e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
return conn;
}
publicstatic String getStringNotNullValue(Object object) {
if (object== null) {
return"";
} else{
returnobject.toString().trim();
}
}
}
MatchertdMatcher = tdPattern.matcher(innerTR);
//弹出层的
PatterntdPatternPop = Pattern.compile("(.*)()(.*)");
Matcher m1 =p1.matcher(content);
if(m1.matches()){
vo.setMobile(getStringNotNullValue(m1.group(5)));
}
Pattern p2 =Pattern.compile("(.*)(电邮)(.*?)()(.*?)()(.*)");
Matcher m2 =p2.matcher(content);
if(m2.matches()){
vo.setEmail(getStringNotNullValue(m2.group(5)));
}
//公司URL
Pattern p3 = Pattern.compile("(.*)(网址)(.*?)(<a href=\")(.*?)(\" target=\"_blank\")(.*)");
Matcher m3 = p3.matcher(content);
if(m3.matches()){
vo.setUrl(getStringNotNullValue(m3.group(5)));
}
//标签
Pattern p4 = Pattern.compile("(.*)(产品分类)(.*?)(-)(.*?)(</td>)(.*)");
Matcher m4 =p4.matcher(content);
if(m4.matches()){
vo.setTags(getStringNotNullValue(m4.group(5)));
}
list.add(vo);
}
i++;
System.out.println("解析第"+i+"条!!!");
}
System.out.println("------------------解析结束-------------------");
returnlist;
}
privatestatic void insertData2DataBase(Listlist, Connection conn)throwsException{
System.out.println("待插入的数据条数:" + list.size());
String sql ="insert into test_enterprise_info(enterprise_name,url,email,mobile,tags) values (?,?,?,?,?)";
PreparedStatement stmt = conn.prepareStatement(sql);
for(EnterprisInfoVo vo:list){
stmt.setString(1, vo.getName());
stmt.setString(2, vo.getUrl());
stmt.setString(3, vo.getEmail());
stmt.setString(4, vo.getMobile());
stmt.setString(5, vo.getTags());
stmt.addBatch();
}
stmt.executeBatch();
System.out.println("数据插入完毕!!!");
}
publicstatic Connection connectDataBase() throws SQLException{
Connectionconn = null;
String url = "jdbc:mysql://localhost:3306/smartpr?"
+"user=root&password=root&useUnicode=true&characterEncoding=UTF8";
try {
Class.forName("com.mysql.jdbc.Driver");// 动态加载mysql驱动
System.out.println("成功加载MySQL驱动程序");
conn = DriverManager.getConnection(url);
} catch (SQLException e) {
System.out.println("MySQL操作错误");
e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
return conn;
}
publicstatic String getStringNotNullValue(Object object) {
if (object== null) {
return"";
} else{
returnobject.toString().trim();
}
}
}
以上是把网页上的数据插入到数据库的过程,这里需要注意一下java的正则表达式,要注意find()方法和matches方法的区别,find()方法是会自动将符合条件的目标移动到下一个,并且是按照部分匹配的,注意我的代码中matches的正则开始和结尾都用到了(.*),而find()方法没有用;matches()方法刚好相反;
下面把数据库中的数据导入到桌面上:
package com;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFFont;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.util.Region;
import com.mysql.jdbc.PreparedStatement;
public class ExcelTest {
publicstatic void main(String[] args) throws Exception{
//1、连接数据库
Connectionconn = connectDataBase();
//2、从数据库中取数据
Listlist =getDataFromDataBase(conn);
//3、把数据插入excel表中
insertData2Excel(list);
}
privatestatic ListgetDataFromDataBase(Connection conn) throwsException{
String sql ="select * from test_enterprise_info";
PreparedStatement stmt = (PreparedStatement)conn.prepareStatement(sql);
ResultSet rs= stmt.executeQuery();
Listlist =new ArrayList();
while(rs.next()){
EnterprisInfoVo vo = new EnterprisInfoVo();
String name= rs.getString(2);
String url =rs.getString(3);
String email= rs.getString(4);
Stringmobile = rs.getString(5);
String tags= rs.getString(6);
vo.setEmail(email);
vo.setMobile(mobile);
vo.setName(name);
vo.setTags(getStringNotNullValue(tags).replace("
", ""));
vo.setUrl(url);
list.add(vo);
}
returnlist;
}
@SuppressWarnings("deprecation")
publicstatic void insertData2Excel(Listlist){
//声明一个工作薄
HSSFWorkbook wb = new HSSFWorkbook();
//声明一个单子并命名
HSSFSheet sheet = wb.createSheet("企业信息表");
//给单子名称一个长度
sheet.setDefaultColumnWidth((short)15);
// 生成一个样式
HSSFCellStyle style = wb.createCellStyle();
//创建第一行(也可以称为表头)
sheet.addMergedRegion(new Region(0, (short)0, 0, (short)4));
HSSFRow row0 = sheet.createRow(0);
HSSFCell cell0 = row0.createCell(0);
row0.setHeight((short)500);
HSSFCellStyle style0 = wb.createCellStyle();
row0.setRowStyle(style0);
//设置字体
HSSFFont font = wb.createFont();
style0.setFont(font);
font.setFontHeightInPoints((short)14);
font.setColor(HSSFFont.BOLDWEIGHT_BOLD);
//单元格内容
cell0.setCellValue("企业信息表");
HSSFRow row = sheet.createRow(1);
//样式字体居中
style.setAlignment(HSSFCellStyle.ALIGN_CENTER);
//给表头第一行一次创建单元格
HSSFCell cell = row.createCell((short) 0);
cell.setCellValue("企业名称");
cell.setCellStyle(style);
cell = row.createCell( (short) 1);
cell.setCellValue("企业网站URL");
cell.setCellStyle(style);
cell = row.createCell((short) 2);
cell.setCellValue("企业email");
cell.setCellStyle(style);
cell = row.createCell((short) 3);
cell.setCellValue("企业联系电话");
cell.setCellStyle(style);
cell = row.createCell((short) 4);
cell.setCellValue("企业标签");
cell.setCellStyle(style);
//向单元格里填充数据
for (int i = 0; i < list.size(); i++) {
row = sheet.createRow(i + 2);
EnterprisInfoVo vo = list.get(i);
row.createCell(0).setCellValue(vo.getName());
row.createCell(1).setCellValue(vo.getUrl());
row.createCell(2).setCellValue(vo.getEmail());
row.createCell(3).setCellValue(vo.getMobile());
row.createCell(4).setCellValue(vo.getTags());
}
try {
//默认导出到E盘下
FileOutputStream out = newFileOutputStream("C://Users//Administrator//Desktop/EnterpriseInfo.xls");
wb.write(out);
out.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
publicstatic Connection connectDataBase() throws SQLException{
Connectionconn = null;
String url = "jdbc:mysql://localhost:3306/smartpr?"
+"user=root&password=root&useUnicode=true&characterEncoding=UTF8";
try {
Class.forName("com.mysql.jdbc.Driver");// 动态加载mysql驱动
System.out.println("成功加载MySQL驱动程序");
conn = DriverManager.getConnection(url);
} catch (SQLException e) {
System.out.println("MySQL操作错误");
e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
return conn;
}
publicstatic String getStringNotNullValue(Object object) {
if (object== null) {
return"";
} else{
returnobject.toString().trim();
}
}
}
1 0
- java编写网页爬虫(分页——插入数据库——导出数据)
- java编写网页爬虫(分页——插入数据…
- java网页爬虫——邮箱
- Java爬虫——抓取静态网页
- Java豆瓣电影爬虫——减少与数据库交互实现批量插入
- Python3 大型网络爬虫实战 004 — scrapy 大型静态商城网站爬虫项目编写及数据写入数据库实战 — 实战:爬取淘宝
- 正则表达式—网页爬虫
- 连接数据库,分页类,插入类,删除类,导出类,导入类,计数类的编写
- php数据库——查询分页数据
- 读取数据库数据——分页展示
- JAVA实现网页爬虫及将数据写入数据库
- 自己写网页爬虫——网页分类抓取/采集并导入数据库
- Java爬虫——phantomjs抓取ajax动态加载网页
- Java爬虫——phantomjs抓取ajax动态加载网页
- 爬虫记录(6)——爬虫实战:爬取知乎网站内容,保存到数据库,并导出到Excel
- Scrapy爬虫(三)——简易动态网页爬虫
- python爬虫——写出最简单的网页爬虫
- Java Web—数据库分页操作
- Android使用Cordova框架开发Android Hybrid App
- Best Time to Buy and Sell Stock III
- (二)Hyper如何对DataBlock中的数据进行索引
- 将pgm图片使用Sequence file存储到hdfs,并读取为Mat数组格式
- 后缀数组模板
- java编写网页爬虫(分页——插入数据库——导出数据)
- Sublime Text 3 常用插件以及安装方法
- 定制自己的报表!7款实用开源报表工具
- UIAutomator定位Android控件的方法实践和建议(Appium姊妹篇)
- Java经典类库-Guava中的函数式编程讲解
- LeetCode 142. Linked List Cycle II(循环链表)
- Android使用ActionBar搭建Android UI框架
- iOS压缩图片
- 二项式定理