python数据预处理练习
来源:互联网 发布:什么是软件开发模式 编辑:程序博客网 时间:2024/06/14 22:12
# -*- coding: utf-8 -*-
# Data preprocessing exercises on CSV data.
#
# The data matrix is a list of rows as returned by ``csv.reader``; every cell
# is a string.  Note the naming used throughout this file: "Line" means a
# matrix row and "Row" means a matrix column.  Column-wise functions receive a
# single column (as produced by ``getRow``) whose element 0 is the column
# header; they therefore only modify elements from index 1 onwards, and they
# modify the list in place as well as returning it.
#
# The code runs unchanged on Python 2 and Python 3.
import csv
import math
import re  # kept: part of the file's original import set


def _as_number(value):
    """Return ``value`` as a finite float, or None when it is not numeric.

    Unlike a ``[0-9]+`` prefix match this accepts negative numbers and
    leading-dot decimals, and rejects text such as ``"12abc"``, empty cells,
    NaN and infinities.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    if math.isnan(number) or math.isinf(number):
        return None
    return number


def _numeric_values(inList):
    """Return the numeric cells of ``inList`` as floats, in original order."""
    converted = (_as_number(cell) for cell in inList)
    return [number for number in converted if number is not None]


def _numeric_cells(inList):
    """Return ``(index, value)`` pairs for the numeric cells after the header."""
    cells = []
    for index in range(1, len(inList)):
        number = _as_number(inList[index])
        if number is not None:
            cells.append((index, number))
    return cells


def _smooth_bin(inList, cells):
    """Overwrite every cell of one bin with the mean of that bin."""
    mean = average([value for _, value in cells])
    for index, _ in cells:
        inList[index] = mean


def fileREAD(fileURL, access):
    """Read a CSV file and return its content as a list of rows.

    fileURL -- path of the CSV file.
    access  -- mode string passed to ``open`` (for example ``"r"``).

    Each row is a list of strings.  The file is closed even when reading
    fails part-way through.
    """
    with open(fileURL, access) as csvfile:
        return list(csv.reader(csvfile))


def getLine(inList, Line):
    """Return the matrix row at index ``Line``."""
    return inList[Line]


def getRow(inList, Row):
    """Return the matrix column at index ``Row`` as a new list."""
    return [line[Row] for line in inList]


def setLine(inList, childList, Line):
    """Replace the matrix row at index ``Line`` with ``childList``."""
    inList[Line] = childList


def setRow(inList, chikdList, Row):
    """Overwrite the matrix column ``Row`` with the values of ``chikdList``.

    Only the first ``len(chikdList)`` rows are touched.
    """
    for index, value in enumerate(chikdList):
        inList[index][Row] = value


def addLine(inList, childLine):
    """Append ``childLine`` to the matrix as a new row."""
    inList.append(childLine)


def addRow(inList, childRow):
    """Append ``childRow`` to the matrix as a new column.

    Raises IndexError when ``childRow`` has fewer elements than the matrix
    has rows; surplus elements are ignored.
    """
    for index, line in enumerate(inList):
        line.append(childRow[index])


def getAVG(inList):
    """Return the mean of the numeric cells of ``inList``.

    Non-numeric cells (header, missing values) are skipped.  When there is no
    numeric cell at all, the message string "当前特征无平均值" is returned
    instead of a number.
    """
    numbers = _numeric_values(inList)
    if not numbers:
        return "当前特征无平均值"
    return sum(numbers) / len(numbers)


def getAVE(inList):
    """Return the population variance of the numeric cells of ``inList``.

    Non-numeric cells are skipped.  When there is no numeric cell at all, the
    message string "当前特征无方差" is returned instead of a number.
    """
    numbers = _numeric_values(inList)
    if not numbers:
        return "当前特征无方差"
    mean = sum(numbers) / len(numbers)
    squared_deviation = sum((number - mean) ** 2 for number in numbers)
    # Divide by the count: callers (zScore) take the square root of this value
    # to obtain the standard deviation.
    return squared_deviation / len(numbers)


def average(seq, total=0.0):
    """Return ``total`` plus the sum of ``seq``, divided by the item count.

    Raises ZeroDivisionError when ``seq`` is empty.
    """
    count = 0
    for item in seq:
        total += item
        count += 1
    return total / count


def getQUANTILE(inList, inlocaltion):
    """Return the quantile ``inlocaltion`` of the numeric cells of ``inList``.

    inlocaltion -- quantile position, must satisfy ``0 <= inlocaltion < 1``.

    The median (0.5) averages the two middle values for an even count; every
    other position returns the sorted value at index ``int(count * position)``
    without interpolation.  Returns the string "输入的分位数数值错误" for an
    invalid position and "当前特征不可求中位数" when there is no numeric cell.
    """
    if not 0 <= inlocaltion < 1:
        return "输入的分位数数值错误"
    numbers = sorted(_numeric_values(inList))
    if not numbers:
        return "当前特征不可求中位数"
    count = len(numbers)
    if inlocaltion == 0.5:
        middle = count // 2
        if count % 2 == 1:
            return numbers[middle]
        return (numbers[middle - 1] + numbers[middle]) / 2.0
    return numbers[int(count * inlocaltion)]


def removeNoiseAuto(inList):
    """Blank out outliers of a column using the interquartile range.

    A cell is replaced by ``''`` when it lies more than 1.5 * IQR above the
    third quartile or below the first quartile.  Non-numeric cells are left
    untouched; a column without numeric data is returned unchanged.
    """
    upper = getQUANTILE(inList, 0.75)
    lower = getQUANTILE(inList, 0.25)
    if not isinstance(upper, float) or not isinstance(lower, float):
        # getQUANTILE reported "no numeric data" with a message string.
        return inList
    limit = 1.5 * (upper - lower)
    for index, number in _numeric_cells(inList):
        if number - upper > limit or lower - number > limit:
            inList[index] = ''
    return inList


def removeNoiseByThresholdMin(inList, inThresholdMin):
    """Blank out (``''``) every numeric cell smaller than ``inThresholdMin``."""
    for index, number in _numeric_cells(inList):
        if number < inThresholdMin:
            inList[index] = ''
    return inList


def removeNoiseByThresholdMax(inList, inThresholdMax):
    """Blank out (``''``) every numeric cell larger than ``inThresholdMax``."""
    for index, number in _numeric_cells(inList):
        if number > inThresholdMax:
            inList[index] = ''
    return inList


def _fill_missing(inList, filler):
    """Replace empty cells after the header with ``str(filler)``."""
    if not isinstance(filler, float):
        # The statistic could not be computed; do not write its error
        # message into the data.
        return inList
    text = str(filler)
    for index in range(1, len(inList)):
        if inList[index] == '':
            inList[index] = text
    return inList


def autoPaddingByAVG(inList):
    """Fill missing (``''``) cells of a column with the column mean."""
    return _fill_missing(inList, getAVG(inList))


def autoPaddingByMedian(inList):
    """Fill missing (``''``) cells of a column with the column median."""
    return _fill_missing(inList, getQUANTILE(inList, 0.5))


def binningWidth(inList, width):
    """Discretise a column with width-limited bins, smoothing by bin mean.

    The numeric cells are visited in ascending order.  A new bin starts as
    soon as a value exceeds the first value of the current bin by more than
    ``width``.  Every cell is then replaced by the mean (a float) of its bin.
    Non-numeric cells are left untouched.
    """
    cells = sorted(_numeric_cells(inList), key=lambda cell: cell[1])
    start = 0
    for position in range(1, len(cells) + 1):
        at_end = position == len(cells)
        if at_end or cells[position][1] - cells[start][1] > width:
            _smooth_bin(inList, cells[start:position])
            start = position
    return inList


def binningDeep(inList, deep1):
    """Discretise a column with equal-frequency bins, smoothing by bin mean.

    The numeric cells are sorted ascending and cut into consecutive bins;
    every cell is replaced by the mean (a float) of its bin.  The last bin
    may be smaller.  Non-numeric cells are left untouched.

    NOTE(review): each bin holds ``deep1 - 1`` values, not ``deep1``.  This
    is the long-standing behaviour and is kept for compatibility; confirm
    whether the offset is intended.

    Raises ValueError when ``deep1`` is smaller than 2.
    """
    deep = deep1 - 1
    if deep < 1:
        raise ValueError("deep1 must be at least 2")
    cells = sorted(_numeric_cells(inList), key=lambda cell: cell[1])
    for start in range(0, len(cells), deep):
        _smooth_bin(inList, cells[start:start + deep])
    return inList


def oneHot(inList, Row):
    """One-hot encode column ``Row`` of the matrix in place.

    For every distinct value of the column (in order of first appearance) a
    new column is appended: its header is the value itself and its cells are
    ``'1'`` where the row holds that value and ``'0'`` elsewhere.  The
    resulting matrix is printed row by row; nothing is returned.
    """
    values = getRow(inList, Row)[1:]  # element 0 is the column header
    categories = []
    seen = set()
    for value in values:
        if value not in seen:
            seen.add(value)
            categories.append(value)
    for category in categories:
        flags = ['1' if value == category else '0' for value in values]
        addRow(inList, [category] + flags)
    for line in inList:
        print(line)


def minMax(inList):
    """Min-max normalise a column to the range [0, 1].

    Numeric cells are replaced by strings with four decimals.  When all
    values are equal they all become ``"0.0000"``; a column without numeric
    data is returned unchanged.
    """
    cells = _numeric_cells(inList)
    if not cells:
        return inList
    numbers = [number for _, number in cells]
    minvalue = min(numbers)
    span = max(numbers) - minvalue
    for index, number in cells:
        scaled = (number - minvalue) / span if span else 0.0
        inList[index] = "%.4f" % scaled
    return inList


def zScore(inList):
    """Z-score normalise a column (zero mean, unit standard deviation).

    Numeric cells are replaced by strings with four decimals.  When the
    standard deviation is zero they all become ``"0.0000"``; a column without
    numeric data is returned unchanged.
    """
    mean = getAVG(inList)
    variance = getAVE(inList)
    if not isinstance(mean, float) or not isinstance(variance, float):
        return inList
    stand = math.sqrt(variance)
    for index, number in _numeric_cells(inList):
        scaled = (number - mean) / stand if stand else 0.0
        inList[index] = "%.4f" % scaled
    return inList


def similarityDistance(inList1, inList2, n):
    """Return the Minkowski distance of order ``n`` between two columns.

    Element 0 of both lists is treated as the header and skipped.  ``n = 1``
    gives the Manhattan distance and ``n = 2`` the Euclidean distance.

    Raises ValueError when ``n`` is not positive and IndexError when
    ``inList2`` is shorter than ``inList1``.
    """
    if n <= 0:
        raise ValueError("n must be positive")
    total = 0
    for index in range(1, len(inList1)):
        total += abs(float(inList1[index]) - float(inList2[index])) ** n
    # The root must match the power used above.
    return pow(total, 1.0 / n)


def similaritySim(inList1, inList2):
    """Return the cosine similarity of two columns.

    Element 0 of both lists is treated as the header and skipped.

    Raises ZeroDivisionError when either column has zero norm and IndexError
    when ``inList2`` is shorter than ``inList1``.
    """
    dot = 0
    for index in range(1, len(inList1)):
        dot += float(inList1[index]) * float(inList2[index])
    norm1 = math.sqrt(sum(float(cell) ** 2 for cell in inList1[1:]))
    norm2 = math.sqrt(sum(float(cell) ** 2 for cell in inList2[1:]))
    return dot / (norm1 * norm2)


if __name__ == "__main__":
    fileInput = fileREAD("D:\\PythonWorkSpace\\ExternalFile\\train.csv", "r")

    # Usage examples (uncomment to try):
    #
    # Get one row / one column:
    #   print(getLine(fileInput, 1))
    #   print(getRow(fileInput, 0))
    # Overwrite one row / one column:
    #   setLine(fileInput, getLine(fileInput, 2), 1)
    #   setRow(fileInput, getRow(fileInput, 2), 1)
    # Mean, variance, quantile:
    #   print(getAVG(getRow(fileInput, 9)))
    #   print(getAVE(getRow(fileInput, 9)))
    #   print(getQUANTILE(getRow(fileInput, 9), 0.5))
    # Noise filtering:
    #   print(removeNoiseAuto(getRow(fileInput, 1)))
    #   print(removeNoiseByThresholdMin(getRow(fileInput, 0), 10))
    #   print(removeNoiseByThresholdMax(getRow(fileInput, 0), 10))
    # Missing-value completion:
    #   print(autoPaddingByAVG(getRow(fileInput, 0)))
    #   print(autoPaddingByMedian(getRow(fileInput, 0)))
    # Binning:
    #   print(binningWidth(getRow(fileInput, 0), 3))
    #   print(binningDeep(getRow(fileInput, 0), 3))
    # One-hot encoding:
    #   oneHot(fileInput, 1)
    # Normalisation:
    #   print(minMax(getRow(fileInput, 0)))
    #   print(zScore(getRow(fileInput, 0)))
    # Similarity:
    #   print(similarityDistance(getRow(fileInput, 0), getRow(fileInput, 0), 2))
    #   print(similaritySim(getRow(fileInput, 0), getRow(fileInput, 1)))
1 0
- python数据预处理练习
- python数据预处理练习
- 1.1股票数据预处理练习
- python数据预处理
- Python数据预处理概述
- 数据预处理练习 等频分箱、one_hot(独热编码)、数据归一化 #python
- python 数据预处理 数据标准化
- python 数据预处理 数据抽样
- Python数据预处理常用函数
- Python-Pandas(3)数据预处理
- Python数据获取和预处理
- 数据预处理的python实现
- 数据预处理练习(深度学习)
- Deep learning:三十一(数据预处理练习)
- python数据预处理之数据清洗
- python . 数据分析1 数据的预处理
- Python数据可视化练习
- python 数据可视化练习
- ida配合windbg调试程序
- uva 11525 Permutation
- Solr调研总结
- 支付宝前端架构专家梁文森(绝云)与你探讨算法与数据结构
- 字符串的startsWith和endWith方法
- python数据预处理练习
- Hadoop2.2集群搭建(1)
- 多线程端口扫描器的实现(java)
- Uva 340 - Master-Mind Hints
- 记一次neo4j项目打包出现的问题
- JQuery处理json与ajax返回JSON实例代码
- 欢迎前往煮梦空间——www.BoilTask.com
- HDU 5755 Gambler Bo
- Structs2 json 打包下载文件,多个文件打成一个压缩包