Java网络爬虫基础和抓取网站数据的两个小实例
来源:互联网 发布:js 视频播放插件 编辑:程序博客网 时间:2024/06/05 19:33
前段时间在学习爬虫,并从网络抓取了一些简单的数据,记录一下。
抓取分成下面3个部分:
1、网络请求
2、解析抓取下来的页面，并且处理乱码或 gzip 压缩响应的问题
3、拿到指定的数据、资源
完整代码如下:
第一个实例:
-
-
-
-
- public static Map<String, String> parseClPage(){
- String html = "http://cl.xxxx/thread0806.php"; // 解析的网站域名
- String currentuserdesktop = System.getProperty("user.home")+"\\Desktop";
- Map<String, String> resultMap = new TreeMap<String, String>();
- Document doc = null;
- try {
- for (int i = 0; i < 199; i++) {
- StringBuffer htmlCode = new StringBuffer("");
- HttpMethod httpMethod = new GetMethod("http://cl.xxxx/thread0806.php?fid=7&search=&page="+(i+1));
- HttpClient client = new HttpClient();
- httpMethod.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
- httpMethod.addRequestHeader("Accept-Encoding", "gzip, deflate, sdch");
- httpMethod.addRequestHeader("Accept-Language", "zh-CN,zh;q=0.8");
- httpMethod.addRequestHeader("Referer", "http://cl.clvv.biz/thread0806.php?fid=7");
- httpMethod.addRequestHeader("HTTPS", "1");
- httpMethod.addRequestHeader("Connection", "keep-alive");
- httpMethod.addRequestHeader("Host", "cl.clvv.biz");
- httpMethod.addRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36ozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36");
- client.setTimeout(3000);
- client.executeMethod(httpMethod);
- InputStream inputStream = httpMethod.getResponseBodyAsStream();
- GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream);
- InputStreamReader inputStreamReader = new InputStreamReader(gzipInputStream,Charset.forName("gb2312"));
- BufferedReader bin21 = new BufferedReader(inputStreamReader);
- while(bin21.readLine()!=null){
- String line = bin21.readLine();
- htmlCode.append(line);
- }
- doc = Jsoup.parse(htmlCode.toString());
- Elements elementsTr = doc.select("table tr");
- for (Element element : elementsTr) {
- String title = element.select("td").eq(1).select("h3 a").text();
- if(null!=title && !"".equals(title)){
- String link = "http://cl.xxxx/"+element.select("td").eq(1).select("h3 a").attr("href");
-
- writefiletotxt((new FileWriter(currentuserdesktop+"\\查找结果.txt",true)),("标题:"+title+"\t链接:"+link+"\r\n"));
- }
- }
-
- httpMethod.abort();
- httpMethod.releaseConnection();
- }
- System.out.println("done--");
- } catch (Exception e) {
- e.printStackTrace();
- }
- return resultMap;
- }
- public static void writefiletotxt(FileWriter fw,String result){
- try {
- fw.write(result);
- fw.flush();
- fw.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
第二个实例:抓取网站图片。思路和第一个差不多
-
-
-
-
- public class CatchImages {
-
- private static String curdesktop = System.getProperty("user.home")+"\\Desktop\\CatchImages\\";
-
- public static void main(String[] args) {
- doCatch("http://item.jd.com/716240.html");
- }
-
-
- public static Integer doCatch(String site){
- GetMethod method = new GetMethod(site);
- HttpClient client = new HttpClient();
-
- try {
- method.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
-
- method.addRequestHeader("Accept-Language", "zh-CN,zh;q=0.8");
- method.addRequestHeader("Avail-Dictionary", "XprLfaXG");
- method.addRequestHeader("Cache-Control", "max-age=0");
- method.addRequestHeader("Connection", "keep-alive");
- method.addRequestHeader("Cookie", "");
- method.addRequestHeader("Host", "user.qzone.qq.com");
- method.addRequestHeader("If-Modified-Since", "Thu, 24 Sep 2015 02:55:30 GMT");
- method.addRequestHeader("Upgrade-Insecure-Requests", "1");
- method.addRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36ozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36");
-
- client.executeMethod(method);
- String htmlCode = method.getResponseBodyAsString();
-
- Document doc = Jsoup.parse(htmlCode);
- Elements elementImg = doc.select("body img");
- for (Element element : elementImg) {
- String src = element.attr("src");
- if(src.contains("http")){
-
- }else {
- String rootUrl = HTMLParserHelper.getRootUrl(site);
-
- src = rootUrl+src;
- }
- System.out.println(src);
- downloadImage(src);
- System.out.println("ok");
- }
- System.out.println(elementImg.size()+" result catched.");
- } catch (Exception e) {
- e.printStackTrace();
- } finally{
- method.abort();
- method.releaseConnection();
- }
- return 0;
- }
-
-
- public static void downloadImage(String imageUrl){
- GetMethod method = new GetMethod(imageUrl);
- HttpClient client = new HttpClient();
- try {
- client.executeMethod(method);
- InputStream inputStream = method.getResponseBodyAsStream();
-
- File file = new File(curdesktop);
- if(!file.exists()){
- try {
- file.mkdir();
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
-
- byte b[] = {1};
- int size = 0;
- FileOutputStream outputStream = new FileOutputStream(new File(curdesktop+HTMLParserHelper.getImageNameAndHouzui(imageUrl)));
- while((size=inputStream.read(b))!=-1){
- outputStream.write(b, 0, size);
- }
- outputStream.close();
- } catch (Exception e) {
- e.printStackTrace();
- } finally{
-
- method.abort();
- method.releaseConnection();
- }
-
- }
- }