简易英文分词算法(python)

来源:互联网 发布:郭天祥单片机下载 编辑:程序博客网 时间:2024/04/27 03:17

可以区分日期、分数、百分数、十进制计数法、常用缩写。
但是还有诸多问题，同样地，结课后如有机会我会完善。
—— 2017.10.27

# -*- coding: utf-8 -*-
"""Simple rule-based English tokenizer.

Reads one paragraph from standard input, normalizes a few contractions and
suffixes, then prints each token wrapped in square brackets.  Dates
(``2017/10/27``), percentages (``-5.5%``) and numbers (``1,000.5``) are kept
as single tokens; everything else is split on spaces, commas and periods.
"""
__author__ = 'Zhao'

import re

# Normalization rules, applied in this exact order before tokenizing.
# Order matters: the plural/"i"-suffix rules are deliberately lossy and the
# last two rules repair the damage they do to the words "I" and "is".
_NORMALIZE_RULES = (
    (r'Prof\.|prof\.', 'professor'),
    (r'ies\W|ies$', 'i '),
    (r'i\'m|I\'m', 'I am '),
    (r'it\'s|It\'s', 'It is '),
    (r'can\'t|Can\'t', 'can not'),
    (r'doesn\'t|Doesn\'t', 'does not'),
    (r'\'re', " are"),
    (r'i\W|i$', 'y '),
    (r's\W|s$', ' '),
    (r'let\'|Let\'', 'let us'),
    (r'\Wy\W|\Wy$', ' I '),   # a lone "i" became "y" above; restore it as "I"
    (r'\Wi\W|\Wi$', ' is '),  # "is" lost its "s" above; restore it
)

# Year/month with an optional day, e.g. 2017/10 or 2017/10/27.
_DATE = re.compile(
    r'(\d{4})/(10|11|12|0\d{1}|\d{1})(/([12]\d{1}|3[01]|0\d{1}|\d{1}))?')
# Optionally signed percentage; the decimal point is escaped so that it only
# matches a literal ".".
_PERCENTAGE = re.compile(r'[+-]?\d+(\.\d+)?%')
# Number with optional comma groups and dotted parts.
# NOTE: the number of digits between commas is not validated.
_NUMBER = re.compile(r'[0-9]+(,[0-9]+)*(\.[0-9]+)*')
# Tried in this order at the start of the remaining text.
_SPECIAL_PATTERNS = (_DATE, _PERCENTAGE, _NUMBER)

# A word immediately followed by a comma or a period.
_WORD_BEFORE_PUNCT = re.compile(r'(\w+)[,.]')
# Characters that separate tokens and are never emitted themselves.
_SEPARATORS = ',. '


def normalize(text):
    """Return *text* with every normalization rule applied in order."""
    for pattern, replacement in _NORMALIZE_RULES:
        text = re.sub(pattern, replacement, text)
    return text


def tokenize(text):
    """Split an English paragraph into a list of token strings.

    The text is normalized first (see ``normalize``).  At each position a
    date, percentage or number is preferred over a plain word.  A word that
    is directly followed by a comma or period is emitted without the
    punctuation and scanning resumes right after it, so no text is dropped.
    An empty or separator-only input yields an empty list.
    """
    remaining = normalize(text)
    tokens = []
    while True:
        # Drop separators up front; this also guarantees the text is
        # non-empty before anything below looks at it.
        remaining = remaining.lstrip(_SEPARATORS)
        if not remaining:
            break
        for pattern in _SPECIAL_PATTERNS:
            found = pattern.match(remaining)
            if found:
                tokens.append(found.group())
                remaining = remaining[found.end():]
                break
        else:
            word = _WORD_BEFORE_PUNCT.match(remaining)
            if word:
                tokens.append(word.group(1))
                remaining = remaining[word.end():]
            else:
                # Plain chunk: everything up to the next space (or the end).
                chunk, _, remaining = remaining.partition(' ')
                tokens.append(chunk)
    return tokens


def main():
    """Prompt for a paragraph and print its tokens as ``[token] `` items."""
    paragraph = input("please input a pharagraph:\n")
    for token in tokenize(paragraph):
        print('[' + token + ']', end=' ')


if __name__ == '__main__':
    main()
原创粉丝点击