接触一下爬虫

来源:互联网 发布:加工中心手动编程图纸 编辑:程序博客网 时间:2024/05/22 15:36

1.获取baidu图片链接
首先测试一下,把 HTTP 响应内容获取下来:

/**
 * Fetches the Baidu home page and saves the response body to the file
 * "1.txt", echoing every line to standard output as it is read.
 */
public class baidu {
    /**
     * Program entry point.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // The page to fetch.
        String url = "http://www.baidu.com";
        try {
            // Turn the string into a URL object and open the connection to it.
            URL realUrl = new URL(url);
            URLConnection connection = realUrl.openConnection();
            connection.connect();
            // try-with-resources closes the reader and the writer even when
            // reading or writing fails part-way through.
            try (BufferedReader in =
                         new BufferedReader(new InputStreamReader(connection.getInputStream()));
                 BufferedWriter out = new BufferedWriter(new FileWriter("1.txt"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    // Emit each line exactly once; writing an accumulated
                    // buffer here would duplicate all earlier lines on every pass.
                    System.out.println(line);
                    out.write(line);
                    out.newLine();
                }
            }
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常" + e);
            e.printStackTrace();
        }
    }
}

接下来使用正则表达式来获取响应内容中的必要信息。
这里写图片描述
这是一部分信息(这部分是获取百度图片链接的)

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.PasswordAuthentication;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by dell on 2016/7/29.
 * Fetches the Baidu home page and extracts an image link from it.
 */
public class Get_Baidu_1 {

    /**
     * Performs a GET request and returns the response body with all lines
     * concatenated (line terminators are dropped).
     *
     * @param url the address to fetch
     * @return the response text; whatever was read before a failure, or an
     *         empty string when nothing could be read
     */
    public static String sendGet(String url) {
        // Accumulates the page content; StringBuilder avoids quadratic copying.
        StringBuilder result = new StringBuilder();
        try {
            // Turn the string into a URL and open the actual connection.
            URL realUrl = new URL(url);
            URLConnection connection = realUrl.openConnection();
            connection.connect();
            // try-with-resources closes the reader on every path.
            try (BufferedReader in =
                         new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
                String line;
                while ((line = in.readLine()) != null) {
                    result.append(line);
                }
            }
        } catch (Exception e) {
            // Report the cause instead of hiding it; callers still get the
            // (possibly empty) text read so far.
            System.out.println("GET失败: " + e);
        }
        // Returning here rather than from a finally block lets Errors propagate.
        return result.toString();
    }

    /**
     * Returns the first substring of {@code targetSrc} matched by the regular
     * expression {@code patternSrc}.
     *
     * @param targetSrc  the text to search
     * @param patternSrc the regular expression to look for
     * @return the whole first match, or the literal string "Nothing" when the
     *         pattern does not occur
     */
    public static String RegxString(String targetSrc, String patternSrc) {
        Pattern pattern = Pattern.compile(patternSrc);
        Matcher matcher = pattern.matcher(targetSrc);
        if (matcher.find()) {
            return matcher.group();
        }
        return "Nothing";
    }

    /**
     * Program entry point: downloads the page, prints it, then prints the
     * image link found in it.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // The page to fetch.
        String url = "http://www.baidu.com";
        // Fetch the page and show its content.
        String result = sendGet(url);
        System.out.println(result);
        // NOTE(review): the '.' characters in this pattern are unescaped and
        // match any character, not only a literal dot.
        String imgSrc =
                RegxString(result, "src=\\/\\/(\\w{3}).(\\w+).(\\w+)\\/(\\w+)\\/([a-zA-Z_]+).(\\w+)");
        System.out.println("image = " + imgSrc);
    }
}

这样就完成了百度图片链接的获取。

1 0
原创粉丝点击