python 清洗脏数据
来源:互联网 发布:网络推广电话销售 编辑:程序博客网 时间:2024/05/02 15:42
调取文本数据,清洗后写入文本
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Clean the HTML body column of a news workbook and save a new workbook.

Python 2 script.  The first sheet of the source workbook is copied cell by
cell; the HTML body column is reduced to plain text and the first image URL
found in the body is appended as an extra trailing column.
"""
import sys
import re
import json
import xlwt
import xlrd
from xlutils.copy import copy

# Python 2 only: make utf-8 the codec for implicit str <-> unicode conversion.
reload(sys)
sys.setdefaultencoding('utf-8')

# Default workbook locations (same values the script has always used).
DEFAULT_SOURCE = u'D:\\PythonFiles\\clearData\\今日头条.xlsx'
DEFAULT_OUTPUT = r'D:\PythonFiles\clearData\todayNews.xls'

# The HTML body is stored in the 10th column counted from the right.
CONTENT_OFFSET_FROM_END = 10

# The sheet starts with two header rows; data begins at row index 2.
HEADER_ROWS = 2

# Quotes are doubled in the stored HTML, hence the `""` in the patterns.
_IMAGE_SRC_RE = re.compile(r'src=""(.+?)""')
_PARAGRAPH_RE = re.compile(r'<p>(.+?)</p>')

# Fragments removed from the paragraph text, applied in this order.
_NOISE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    u'<img(.*?)"">',
    u'<strong>(.*?)</strong>',
    u'↑(.*?)关注我们',
    u'<b(.*?)r>',
))


def _split_body(html):
    """Return ``(text, image_url)`` extracted from one HTML body cell.

    ``text`` is the concatenation of every ``<p>...</p>`` body with the noise
    patterns removed; ``image_url`` is the first ``src`` value, or ``''``.
    An empty cell yields ``('', '')``.
    """
    if not html:
        return '', ''
    found = _IMAGE_SRC_RE.search(html)
    image_url = found.group(1) if found else ''
    text = ''.join(_PARAGRAPH_RE.findall(html))
    for pattern in _NOISE_PATTERNS:
        text = pattern.sub('', text)
    return text, image_url


# Clean dirty data
def testMunicipalCommitteeContent(src_path=DEFAULT_SOURCE,
                                  dst_path=DEFAULT_OUTPUT):
    """Copy the first sheet of *src_path* into *dst_path* with a cleaned body.

    Args:
        src_path: Workbook to read; defaults to the original hard-coded file.
        dst_path: ``.xls`` file to write; defaults to the original location.

    Raises:
        ValueError: The sheet has data rows but fewer than
            ``CONTENT_OFFSET_FROM_END`` columns, so no body column exists.
    """
    table = xlrd.open_workbook(src_path).sheets()[0]
    nrows = table.nrows
    ncols = table.ncols
    content_col = ncols - CONTENT_OFFSET_FROM_END
    if nrows > HEADER_ROWS and content_col < 0:
        raise ValueError(
            'sheet has {0} columns; at least {1} are required'.format(
                ncols, CONTENT_OFFSET_FROM_END))

    wb = xlwt.Workbook()
    ws = wb.add_sheet('News', cell_overwrite_ok=True)

    # Header rows are copied verbatim; min() tolerates a very short sheet.
    for row_idx in range(min(HEADER_ROWS, nrows)):
        for col_idx, value in enumerate(table.row_values(row_idx)):
            ws.write(row_idx, col_idx, value)
    # Headers of the extra image-link column.
    ws.write(0, ncols, u'图片链接')
    ws.write(1, ncols, 'picture_id')

    for row_idx in range(HEADER_ROWS, nrows):
        # Fetch the row once instead of rebuilding it for every cell.
        row = table.row_values(row_idx)
        text, image_url = _split_body(row[content_col])
        for col_idx, value in enumerate(row):
            ws.write(row_idx, col_idx,
                     text if col_idx == content_col else value)
        ws.write(row_idx, ncols, image_url)

    wb.save(dst_path)


if __name__ == '__main__':
    testMunicipalCommitteeContent()

# Article text that followed this script: the next script cleans the data
# using regular expressions that are read from a text file.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2016 R&D, CINS Inc. (cins.com)
#
# Author: Eric x.sun <followyourheart1211@gmail.com>
#
"""Clean selected columns of delimited text records (Python 2 script).

Each selected column is reduced to the text of its ``<p>`` elements (or kept
whole when it has none), stripped of every pattern listed in a regex file,
of URLs and of HTML residue, and the record is written back out.
"""
import os
import re
import sys
from optparse import OptionParser

import common_filter_regex
import settings

# Python 2 only: make utf-8 the codec for implicit str <-> unicode conversion.
reload(sys)
sys.setdefaultencoding("utf-8")

# URLs inside the content: "http" followed by a run of non-Chinese characters.
REGEX_URL = "http" + u"[^\u4e00-\u9fa5]+"

# .jpg or .JPG (dot escaped, case-insensitive)
REGEX_JPG = r"(?i)\.jpg"

# Text between <p> and </p>.
_PARAGRAPH_RE = re.compile(r'<p>(.+?)</p>')

# The literal "[图片]" ("[picture]") placeholder, brackets included.
_IMAGE_PLACEHOLDER_RE = re.compile(u"\\[\u56fe\u7247\\]")

# Two or more consecutive whitespace characters.
_MULTI_SPACE_RE = re.compile(u"\\s{2,}")

# Compiled patterns per (file name, encoding), so the regex file is read
# once per run instead of once per cleaned cell.
_PATTERN_CACHE = {}


def _load_patterns(fileurl, encoding):
    """Return the compiled patterns of *fileurl*, one per non-blank line."""
    key = (fileurl, encoding)
    if key not in _PATTERN_CACHE:
        with open(fileurl, "rb") as handle:
            lines = [raw.strip().decode(encoding) for raw in handle]
        _PATTERN_CACHE[key] = [re.compile(line) for line in lines if line]
    return _PATTERN_CACHE[key]


def remove_useless(content, fileurl, encoding="utf-8"):
    """Remove the useless parts of content.

    Args:
        content: The content, in string format, to be cleaned.
        fileurl: Path of a text file holding one regular expression per line;
            every match of every expression is deleted from the content.
        encoding: The encoding of content and of the regex file.

    Returns:
        The content after removing the file's patterns, URLs, ".jpg"
        residue, leading punctuation (``settings.PUNCTUATIONS``) and
        surrounding whitespace.
    """
    content = content.strip()
    if not content:
        return content
    # Decode once at the boundary; everything below works on unicode text.
    if isinstance(content, bytes):
        content = content.decode(encoding)
    for pattern in _load_patterns(fileurl, encoding):
        content = pattern.sub('', content)
    content = re.sub(REGEX_URL, "", content)
    content = re.sub(REGEX_JPG, "", content)
    # Punctuation at the start of the sentence.
    content = content.lstrip(settings.PUNCTUATIONS)
    # Whitespace at both ends.
    return content.strip()


def replace_with_space(content):
    """Replace the useless parts of content with one space.

    Args:
        content: The content, in string format, to be replaced.

    Returns:
        The content after the ``common_filter_regex`` HTML helpers have run,
        "[图片]" placeholders have been replaced by a space and runs of
        whitespace have been collapsed to a single space.
    """
    content = content.strip()
    if content:
        # \n, \r and \t inside the content are handled earlier
        # (see common_replacer.py).
        # html
        content = common_filter_regex.replace_html_tags(content)
        content = common_filter_regex.replace_html_char_entity(content)
        content = common_filter_regex.replace_html_url(content, " ")
        # [图片] placeholder
        content = _IMAGE_PLACEHOLDER_RE.sub(" ", content)
        # consecutive whitespace
        content = _MULTI_SPACE_RE.sub(" ", content)
    return content.strip()


def read_input(fd, delimiter):
    """Yield every line of *fd*, stripped and split on *delimiter*."""
    for line in fd:
        yield line.strip().split(delimiter)


def check_parameters(**kwargs):
    """Check whether the parameters satisfy the conditions.

    Args:
        delimiter: The delimiter between columns.
        index: The indexes of the content columns.
        data: The file name of the data.

    Returns:
        A boolean value for representing the status of the checking.
    """
    if kwargs.get("delimiter", None) is None:
        msg = [
            "The delimiter is required.",
            "Use '-s' in console mode or 'delimiter=' in func call to set it."
        ]
        print("{0}".format("\n".join(msg)))
        return False

    # A missing 'index' defaults to 0; only an explicit None is rejected.
    if kwargs.get('index', 0) is None:
        msg = [
            "The indexes is required.",
            "Use '-i' in console mode or 'index=' in func call to set it."
        ]
        print("{0}".format("\n".join(msg)))
        return False

    data = kwargs.get('data', None)
    if data is not None and not os.path.isfile(data):
        print("The data does not exist: {0}.".format(data))
        return False

    return True


def _clean_fields(fields, columns, clean):
    """Clean ``fields[col]`` in place for each column; return whether to keep.

    The record is dropped (False) when the last successfully cleaned column
    ended up empty.  A column the record does not have is skipped; a column
    whose cleaning fails is reported on stderr and left untouched.
    """
    keep = True
    for col in columns:
        try:
            raw = fields[col]
        except IndexError:
            continue  # record is shorter than the requested column
        try:
            paragraphs = _PARAGRAPH_RE.findall(raw)
            # Without any <p> element the whole cell is cleaned.
            text = "".join(paragraphs) if paragraphs else raw
            fields[col] = replace_with_space(
                remove_useless(text, clean, "utf-8"))
        except Exception as exc:  # best effort: keep processing the file
            sys.stderr.write("failed to clean column {0}: {1!r}\n".format(
                col + 1, exc))
            continue
        keep = bool(fields[col])
    return keep


def main(delimiter, indexes, data, out, clean):
    """Clean the selected columns of every record and write the result.

    Args:
        delimiter: Column delimiter, or a key of ``settings.FIELD_DELIMITER``
            that is translated to the real delimiter.
        indexes: 1-based column numbers joined by "|", like "1|3|4";
            non-numeric entries are ignored.
        data: Input file name, or None to read from stdin.
        out: Output file name, or None to write to stdout.
        clean: Name of the regex file passed to ``remove_useless``.

    Returns:
        True once all records have been processed.
    """
    columns = [int(tok) - 1 for tok in indexes.split("|") if tok.isdigit()]
    if delimiter in settings.FIELD_DELIMITER:
        delimiter = settings.FIELD_DELIMITER[delimiter]

    # Nothing but records is written to the sink: stdout may be the sink.
    source = sys.stdin if data is None else open(data, "rb")
    try:
        sink = sys.stdout if out is None else open(out, "wb")
        try:
            for fields in read_input(source, delimiter):
                if _clean_fields(fields, columns, clean):
                    sink.write(
                        "{0}\n".format(delimiter.join(fields).strip()))
        finally:
            if out is not None:
                sink.close()
    finally:
        if data is not None:
            source.close()
    return True


def _build_parser():
    """Return the command-line option parser of this script."""
    parser = OptionParser(
        usage="%prog -s delimiter -i index_array -d data -o out -c clean")
    parser.add_option(
        "-s", "--delimiter",
        help=u"The delimiter between columns, like \\001"
    )
    parser.add_option(
        "-i", '--index_array',
        help=u"Array of index in content, that need to been cleaned, starts at 1, like \"1|3|4\"."
    )
    parser.add_option(
        "-d", "--data",
        help=u"The file name of the data to be tagged(includes the full path)"
    )
    parser.add_option(
        "-o", "--out",
        help=u"The file name of the cleaned data(includes the full path)"
    )
    parser.add_option(
        "-c", "--clean",
        help=u"The file name of the file of cleaning data(includes the full path)"
    )
    return parser


if __name__ == "__main__":
    if not sys.argv[1:]:
        # No arguments: keep the historical hard-coded run.
        data_file = r"D:\PythonFiles\clearData\2017041820.news_zhengwen"
        out_file = r"D:\PythonFiles\clearData\tetete.news_zhengwen"
        clean_file = 'clearn.txt'
        main('\001', '19', data_file, out_file, clean_file)
    else:
        parser = _build_parser()
        (opts, args) = parser.parse_args()
        if opts.clean is None:
            parser.error("The clean file is required; use '-c' to set it.")
        if not check_parameters(delimiter=opts.delimiter,
                                index=opts.index_array, data=opts.data):
            sys.exit(1)
        main(delimiter=opts.delimiter, indexes=opts.index_array,
             data=opts.data, out=opts.out, clean=opts.clean)
清洗添加数据 clearn.txt :
<img(.*?)">
<strong>(.*?)</strong>
↑(.*?)关注我们
<b(.*?)r>
阅读全文
0 0
- python 清新脏数据
- 清新空气的早晨
- 气质清新似少女
- 一句话毁掉小清新
- 打造小清新风格!
- 我们都是小清新
- 清新的早晨
- Latex 绘图小清新
- 天很近风景很清新
- Latex 绘图小清新
- 清新脱俗的sql
- python 数据
- python 数据
- python数据
- 清新脱俗的TensorFlow CIFAR10例程的代码重构——更简明更快的数据读取、loss accuracy实时输出
- 简单清新的登录表单
- 【热门主题:微距紫丁花清新主题】
- 清新闹钟系列--项目计划书
- 暴搜——51nod1400 序列分解
- java运算符之++、--
- 验证对象在创建时就会先调用(默认)构造方法
- 十分钟搞清字符集和字符编码
- STM32 心电滤波
- python 清新脏数据
- JZOJ 7.11 B组第一题 解题
- 利用Matlab开发基于XSENS Mtw传感器模块的行人室内实时定位系统的相关
- POJ 3173 Parkside's Triangle G++
- windows程序设计 初入门的小理解
- <有用>解决Apache长时间占用内存大的问题,Apache 内存优化方法-以及查看线程进程命令
- AD中设置元件间的间隔规则
- 《Spring源码深度解析》学习笔记——默认标签的解析
- Bitmap_图片硬盘缓存(3)