Python 清洗脏数据

来源：互联网发布：网络推广电话销售编辑：程序博客网时间：2024/05/02 15:42

调取文本数据，清洗后写入文本

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Clean the HTML article column of an Excel news sheet and save a copy.

This script targets Python 2: it relies on ``reload(sys)`` and
``sys.setdefaultencoding``.
"""
import sys
import re
import json
import xlwt
import xlrd
from xlutils.copy import copy

# Python 2 idiom: make implicit str/unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# Default locations used when the entry point is called without arguments.
# The doubled backslashes spell the same path the script always used.
DEFAULT_SOURCE_XLSX = u'D:\\PythonFiles\\clearData\\今日头条.xlsx'
DEFAULT_OUTPUT_XLS = r'D:\PythonFiles\clearData\todayNews.xls'
# The HTML content column sits this many columns before the end of a row.
DEFAULT_CONTENT_OFFSET = 10

# First image URL inside the HTML (quotes are doubled in the source data).
_IMG_SRC_RE = re.compile(r'src=""(.+?)""')
# Text of every <p>...</p> paragraph.
_PARAGRAPH_RE = re.compile(r'<p>(.+?)</p>')
# Fragments removed from the joined paragraph text, applied in this order.
_NOISE_RES = (
    re.compile(u'<img(.*?)"">'),
    re.compile(u'<strong>(.*?)</strong>'),
    re.compile(u'↑(.*?)关注我们'),
    re.compile(u'<b(.*?)r>'),
)


def _extract_text_and_image(html):
    """Return ``(cleaned_text, first_image_url)`` for one HTML cell."""
    match = _IMG_SRC_RE.search(html)
    picture = match.group(1) if match else ''
    text = ''.join(_PARAGRAPH_RE.findall(html))
    for pattern in _NOISE_RES:
        text = pattern.sub('', text)
    return text, picture


# Clean dirty data.
def testMunicipalCommitteeContent(src_path=DEFAULT_SOURCE_XLSX,
                                  dst_path=DEFAULT_OUTPUT_XLS,
                                  content_offset=DEFAULT_CONTENT_OFFSET):
    """Copy the first sheet of ``src_path`` to ``dst_path`` with cleaned content.

    The first two rows are treated as headers and copied verbatim. For every
    later row the content column is replaced by the concatenated ``<p>``
    paragraph text with image, ``<strong>``, "follow us" and ``<b...r>``
    fragments removed. One extra column holding the first image URL is
    appended to every row.

    Args:
        src_path: Workbook to read (opened with xlrd).
        dst_path: Workbook to write (saved with xlwt).
        content_offset: Distance of the content column from the end of the
            row; the column index is ``ncols - content_offset``.

    Raises:
        ValueError: If the sheet has fewer than ``content_offset`` columns.
    """
    table = xlrd.open_workbook(src_path).sheets()[0]
    nrows = table.nrows
    ncols = table.ncols  # number of columns
    content_col = ncols - content_offset
    if content_col < 0:
        # A negative index would silently wrap around to another column.
        raise ValueError(
            'sheet has {0} columns, need at least {1}'.format(
                ncols, content_offset))

    wb = xlwt.Workbook()
    ws = wb.add_sheet('News', cell_overwrite_ok=True)

    # Header rows are copied as-is (a very short sheet may have fewer than 2).
    for header_row in range(min(2, nrows)):
        for col, value in enumerate(table.row_values(header_row)):
            ws.write(header_row, col, value)
    ws.write(0, ncols, u'图片链接')
    ws.write(1, ncols, 'picture_id')

    for row_idx in range(2, nrows):
        # Fetch the row once; row_values() builds a new list on every call.
        row = table.row_values(row_idx)
        html = row[content_col]
        if html:
            text, picture = _extract_text_and_image(html)
        else:
            text, picture = '', ''
        for col, value in enumerate(row):
            ws.write(row_idx, col, text if col == content_col else value)
        ws.write(row_idx, ncols, picture)

    wb.save(dst_path)


if __name__ == '__main__':
    testMunicipalCommitteeContent()

清洗数据：把正则表达式写在文本文件中，运行时逐行读取并用于清洗

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2016 R&D, CINS Inc. (cins.com)
#
# Author: Eric x.sun <followyourheart1211@gmail.com>
#
"""Clean selected columns of a delimited text file using regex rules.

The removal rules are read from a text file, one regular expression per line.
This script targets Python 2: it relies on ``reload(sys)`` and
``sys.setdefaultencoding``.
"""
import os
import re
import sys
from optparse import OptionParser

import common_filter_regex
import settings

# Python 2 idiom: make implicit str/unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")

# URLs in the content: "http" followed by a run of non-CJK characters.
REGEX_URL = "http" + u"[^\u4e00-\u9fa5]+"
# .jpg or .JPG (the dot is escaped so it only matches a literal dot)
REGEX_JPG = r"\.(?:jpg|JPG)"

# Text of every <p>...</p> paragraph in a field.
_PARAGRAPH_RE = re.compile(r"<p>(.+?)</p>")
# The literal "[图片]" ("[image]") placeholder, not the individual characters.
_IMAGE_PLACEHOLDER_RE = re.compile(u"\\[\u56fe\u7247\\]")
# Runs of two or more whitespace characters.
_MULTI_SPACE_RE = re.compile(u"\\s{2,}")

# Compiled removal patterns keyed by (rule file, encoding), so the rule file
# is read once instead of once per cleaned field.
_PATTERN_CACHE = {}


def _load_patterns(fileurl, encoding):
    """Return the compiled patterns listed one per line in ``fileurl``."""
    key = (fileurl, encoding)
    if key not in _PATTERN_CACHE:
        with open(fileurl, "r") as handle:
            lines = [line.strip().decode(encoding) for line in handle]
        # Blank lines would be no-op patterns; skip them.
        _PATTERN_CACHE[key] = [re.compile(line) for line in lines if line]
    return _PATTERN_CACHE[key]


def remove_useless(content, fileurl, encoding="utf-8"):
    """Remove the useless parts of content.

    Args:
        content: The content, in string format, to be cleaned.
        fileurl: Path of a text file holding one regular expression per line;
            every match of every pattern is deleted from the content.
        encoding: The encoding of content and of the pattern file.

    Returns:
        The content with pattern matches, URLs, ".jpg"/".JPG" suffixes,
        leading punctuation (``settings.PUNCTUATIONS``) and surrounding
        whitespace removed.
    """
    patterns = _load_patterns(fileurl, encoding)
    # Decode once at the boundary and work on unicode from here on.
    if isinstance(content, str):
        content = content.decode(encoding)
    content = content.strip()
    if content:
        for pattern in patterns:
            content = pattern.sub("", content)
        content = re.sub(REGEX_URL, "", content)
        content = re.sub(REGEX_JPG, "", content)
        # Leading punctuation.
        content = content.lstrip(settings.PUNCTUATIONS)
    # Leading and trailing whitespace.
    return content.strip()


def replace_with_space(content):
    """Replace the useless parts of content with one space.

    Args:
        content: The content, in string format, to be replaced.

    Returns:
        The content after replacing.
    """
    content = content.strip()
    if content:
        # \n, \r and \t inside the content are expected to have been handled
        # beforehand (see common_replacer.py).
        # HTML tags, character entities and URLs.
        content = common_filter_regex.replace_html_tags(content)
        content = common_filter_regex.replace_html_char_entity(content)
        content = common_filter_regex.replace_html_url(content, " ")
        # The "[图片]" image placeholder.
        content = _IMAGE_PLACEHOLDER_RE.sub(" ", content)
        # Consecutive whitespace.
        content = _MULTI_SPACE_RE.sub(" ", content)
    return content.strip()


def read_input(fd, delimiter):
    """Yield each line of ``fd`` stripped and split on ``delimiter``."""
    for obj in fd:
        yield obj.strip().split(delimiter)


def check_parameters(**kwargs):
    """Check whether the parameters satisfy the conditions.

    Args:
        delimiter: The delimiter between columns.
        index: The indexes of the content columns.
        data: The file name of the data.

    Returns:
        A boolean value for representing the status of the checking.
    """
    delimiter = kwargs.get("delimiter", None)
    if delimiter is None:
        msg = [
            "The delimiter is required.",
            "Use '-s' in console mode or 'delimiter=' in func call to set it."
        ]
        print("{0}".format("\n".join(msg)))
        return False

    # NOTE: because the default is 0, this check only fails when index=None
    # is passed explicitly; a missing 'index' keyword is accepted.
    indexes = kwargs.get('index', 0)
    if indexes is None:
        msg = [
            "The indexes is required.",
            "Use '-i' in console mode or 'index=' in func call to set it."
        ]
        print("{0}".format("\n".join(msg)))
        return False

    data = kwargs.get('data', None)
    if data is not None and not os.path.isfile(data):
        print("The data does not exist: {0}.".format(data))
        return False
    return True


def main(delimiter, indexes, data, out, clean):
    """Clean the selected columns of every record and write the records out.

    For each selected column the text of all ``<p>...</p>`` paragraphs is
    concatenated (or the whole field is used when it has no paragraphs), then
    passed through ``remove_useless`` and ``replace_with_space``. A record is
    dropped when the last cleaned column ends up empty.

    Args:
        delimiter: Column delimiter, or a key of ``settings.FIELD_DELIMITER``.
        indexes: "|"-separated, 1-based column numbers, e.g. "1|3|4".
        data: Input file name, or None to read from standard input.
        out: Output file name, or None to write to standard output.
        clean: File name of the regex rule file used by ``remove_useless``.

    Returns:
        True once all records have been processed.
    """
    columns = [int(tok) - 1 for tok in indexes.split("|") if tok.isdigit()]
    if delimiter in settings.FIELD_DELIMITER:
        delimiter = settings.FIELD_DELIMITER[delimiter]

    stdin = sys.stdin if data is None else open(data, "rb")
    try:
        stdout = sys.stdout if out is None else open(out, "wb")
        try:
            for obj in read_input(stdin, delimiter):
                empty_line = False
                for i in columns:
                    try:
                        raw = obj[i]
                    except IndexError:
                        # Record has too few columns: leave it untouched.
                        continue
                    paragraphs = _PARAGRAPH_RE.findall(raw)
                    text = "".join(paragraphs) if paragraphs else raw
                    try:
                        cleaned = replace_with_space(
                            remove_useless(text, clean, "utf-8"))
                    except Exception as exc:
                        # Best effort per field: keep the original value, but
                        # report the failure instead of hiding it. Diagnostics
                        # go to stderr because stdout may carry the data.
                        sys.stderr.write(
                            "warning: column {0} left uncleaned: {1!r}\n"
                            .format(i + 1, exc))
                        continue
                    # Store bytes again so the record joins without implicit
                    # str/unicode conversion.
                    if not isinstance(cleaned, str):
                        cleaned = cleaned.encode("utf-8")
                    obj[i] = cleaned
                    empty_line = not obj[i]
                if not empty_line:
                    stdout.write(
                        "{0}\n".format(delimiter.join(obj).strip()))
        finally:
            if out is not None:
                stdout.close()
    finally:
        if data is not None:
            stdin.close()
    return True


if __name__ == "__main__":
    data_firl = r"D:\PythonFiles\clearData\2017041820.news_zhengwen"
    #data_firl = r"D:\PythonFiles\clearData\test.news_zhengwen"
    out_firl = r"D:\PythonFiles\clearData\tetete.news_zhengwen"
    clean_firl = 'clearn.txt'
    main('\001', '19', data_firl, out_firl, clean_firl)

    # Disabled command-line interface, kept for reference:
    #
    # parser = OptionParser(usage="%prog -s delimiter -i index_array -d data  -o out -c clean")
    #
    # parser.add_option(
    #     "-s", "--delimiter",
    #     help=u"The delimiter between columns, like \001"
    # )
    #
    # parser.add_option(
    #     "-i", '--index_array',
    #     help=u"Array of index in content, that need to been cleaned, starts at 1, like \"1|3|4\"."
    # )
    #
    # parser.add_option(
    #     "-d", "--data",
    #     help=u"The file name of the data to be tagged(includes the full path)"
    # )
    #
    # parser.add_option(
    #     "-o", "--out",
    #     help=u"The file name of the cleaned data(includes the full path)"
    # )
    #
    # parser.add_option(
    #     "-c", "--clean",
    #     help=u"The file name of the file of cleaning data(includes the full path)"
    # )
    #
    # if not sys.argv[1:]:
    #     parser.print_help()
    #     exit(1)
    #
    # (opts, args) = parser.parse_args()
    # main(delimiter=opts.delimiter, indexes=opts.index_array, data=opts.data, out=opts.out, clean=opts.clean)

清洗规则文件 clearn.txt 的内容（每行一个正则表达式）：

<img(.*?)">
<strong>(.*?)</strong>
↑(.*?)关注我们
<b(.*?)r>

阅读全文

0 0