抽取文本中的日期和时间

来源:互联网 发布:淘宝小号 浮云网 编辑:程序博客网 时间:2024/06/16 06:28

写这种算法真的是累啊,我是没辙,只能拿正则了,各位仁兄有何高见?

测试效果:
这里写图片描述

public class KeyWordsTest {    String [] input = {            "2017年9月16号",            "2017年10月3日",            "2015年7月",            "9月17号",            "4月15",            "6号",            "星期3",            "下礼拜5",            "本周2",            "周末",            "下周星期5",            "星期天",            "2015-9-7",            "9-7",            "2019.9.8",            "2019.8",            "9.7",            "二零一六年九月十八号",            "三月七号",            "十月二十",            "星期六",            "下礼拜五,我打算回上海",            "今天十五号,明天十六号",            "十月一号国庆节,我们打算回家",            "下午12:56",            "9:30",            "09:31",            "上午七点",            "下午5点",            "十点半",            "今天晚上九点15",            "十点一刻",            "二十三点50",            "十一点",            "以下是2017年4月10日入住,五道口附近的快捷酒店信息(与用户意图最相关的5个快捷酒店信息),请问您想预定哪一个?",            "5"//不能匹配    };    @Test    public void testDate() {        for (int i = 0; i < input.length; i++) {            System.out.print(input[i]+": ");            System.out.println(TimeWords.words(input[i]));                  }           }}
/**
 * Extracts date and time expressions, written with Arabic or Chinese numerals, from free
 * text by means of two regular expressions that are assembled and compiled once when the
 * class is loaded.
 */
public class TimeWords {

    /** Value stored in the result map for every extracted expression. */
    private static final String VALUE = "t 1024";

    private static final String L = "(";
    private static final String R = ")";
    private static final String O = "|";

    /** Compiled once; {@link Pattern} instances are immutable and safe to reuse. */
    private static final Pattern DATE_PATTERN = Pattern.compile(createRegexDate());
    private static final Pattern TIME_PATTERN = Pattern.compile(createRegexTime());

    /**
     * Finds every date and time expression in the given text.
     *
     * @param input text to scan; {@code null} is treated as text without any match
     * @return a new mutable map whose keys are the matched substrings (dates first, then
     *         times; duplicates collapse into one key) and whose values are all
     *         {@code "t 1024"}; empty when nothing matches
     */
    public static Map<String, String> words(String input) {
        Map<String, String> map = new HashMap<>();
        if (input == null) {
            return map;
        }
        collect(DATE_PATTERN, input, map);
        collect(TIME_PATTERN, input, map);
        return map;
    }

    /**
     * Adds every match of {@code pattern} in {@code input} to {@code target}.
     * Matches go straight into the map, so there is no limit on how many can be found.
     */
    private static void collect(Pattern pattern, String input, Map<String, String> target) {
        Matcher matcher = pattern.matcher(input);
        while (matcher.find()) {
            target.put(matcher.group(), VALUE);
        }
    }

    /**
     * Builds the time regex: optional day-period word, hour, a "点" or ":" separator and
     * an optional minute part.
     */
    private static String createRegexTime() {
        // Hour in Arabic digits: 10-19, 20-24, or a single digit with optional leading zero.
        String nh = "((1[0-9])|(2[0-4])|((0|)[0-9]))";
        // Optional day-period prefix; the trailing empty alternative makes it optional.
        String duan = "(上午|下午|晚上|凌晨|早上|夜晚|am|pm|)";
        // Hour in Chinese numerals. No commas inside the class: within [...] a comma is a
        // literal character and would be accepted as a numeral.
        String ch = "(((二|)(十|)[零一二三四五六七八九十]))";
        String ce = "(点|:)";
        String cme = "(分|)";
        // Minute part: "一刻", "半" or two digits 00-59, optionally followed by "分".
        String t1 = "((一刻)|(半)|([0-5][0-9]))" + cme;
        return duan + orRegex(nh, ch) + ce + orRegex(t1, "");
    }

    /**
     * Builds the date regex covering year/month/day written with "年", "月", "日"/"号",
     * weekday expressions, and numeric dates separated by "-" or ".".
     */
    private static String createRegexDate() {
        // Character classes list their members without separators: a comma inside [...]
        // is matched literally and would make "," count as a digit.
        final String C09 = "([0123456789十一二三四五六七八九])";
        final String C19 = "([123456789一二三四五六七八九])";
        final String C0 = "[0十]";
        final String C1 = "[1一]";
        final String C01 = "[0十1一]";
        final String C02 = "[012十一二]";
        final String C3 = "[3三]";
        final String C12 = "[12一二]";
        final String C17 = "[1234567一二三四五六日]";

        // Four-character year, month and day-of-month numerals.
        final String Y = C02 + C09 + C09 + C09;
        final String M = orRegex(C09, C1 + C02, orRegex(C0, "") + C19);
        final String D = orRegex(C12 + C09, C3 + C01, orRegex(C0, "") + C19);
        // Weekday expressions such as "周末", "本周2", "下礼拜五", "星期天".
        final String W = "((周|下周|本周)(" + C17 + "|末))|((下周星期|下礼拜|礼拜|下星期|星期)(" + C17 + "|天))";
        final String M_END1 = "月";
        final String D_END1 = "(日|号)";
        final String Y_END1 = "年";
        final String E = "(-|\\.)";

        String year = orRegex(Y + Y_END1, "");
        String yearMonth = year + M + M_END1;
        String day = D + D_END1;
        String dayWeek = orRegex(day, "") + W;
        String monthDay = orRegex(orRegex(M + M_END1, "") + day, M + M_END1 + D);
        String yearMonthDay = yearMonth + day;

        // Alternation order matters: longer forms come first so they win over their prefixes.
        String worded = orRegex(yearMonthDay, monthDay, dayWeek, yearMonth);
        return orRegex(worded, Y + E + M + E + D, Y + E + M, M + E + D);
    }

    /**
     * Joins the given sub-expressions into one grouped alternation:
     * {@code ((a)|(b)|(c))}. An empty string argument yields an empty alternative.
     */
    private static String orRegex(String... args) {
        StringBuilder regex = new StringBuilder(L);
        for (int i = 0; i < args.length; i++) {
            if (i > 0) {
                regex.append(O);
            }
            regex.append(L).append(args[i]).append(R);
        }
        return regex.append(R).toString();
    }
}

最后构建的 regex_date 正则居然有 2390 多个字符,感觉自己的算法好挫啊。