python数据预处理练习

来源:互联网 发布:什么是软件开发模式 编辑:程序博客网 时间:2024/06/14 22:12
# -*- coding: utf-8 -*-
# Data pre-processing exercise: helpers for reading a CSV file into a
# list-of-rows matrix and for cleaning, discretising and normalising columns.
#
# Conventions used by the functions below:
#   * a "matrix" is a list of rows, each row being a list of cells;
#   * a "column" is a list whose element 0 is the header cell, which is why
#     the column-transforming functions start at index 1;
#   * a missing value is the empty string ''.
#
# The code is written to run unchanged on Python 2.7 and Python 3.
import csv
import math
import re


def _toFloat(value):
    """Return ``value`` as a finite float, or ``None`` if it is not numeric.

    Used instead of a "starts with a digit" regex so that negative numbers
    and already-converted floats count as numeric, while malformed cells
    such as ``'12abc'`` or ``''`` are skipped instead of raising.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    if math.isnan(number) or math.isinf(number):
        return None
    return number


def _numericValues(inList):
    """Return the numeric entries of ``inList`` as floats, in order."""
    parsed = (_toFloat(item) for item in inList)
    return [number for number in parsed if number is not None]


def fileREAD(fileURL, access):
    """Read a CSV file and return its content as a list of rows.

    Args:
        fileURL: Path of the CSV file.
        access: Mode string handed to ``open`` (for example ``"r"``).

    Returns:
        A list holding one list of cell strings per CSV record.
    """
    # ``with`` closes the file even if the CSV parser raises.
    with open(fileURL, access) as csvfile:
        return list(csv.reader(csvfile))


def getLine(inList, Line):
    """Return row number ``Line`` of the matrix ``inList``."""
    return inList[Line]


def getRow(inList, Row):
    """Return column number ``Row`` of the matrix ``inList`` as a new list."""
    return [line[Row] for line in inList]


def setLine(inList, childList, Line):
    """Replace row number ``Line`` of ``inList`` with ``childList``."""
    inList[Line] = childList


def setRow(inList, chikdList, Row):
    """Overwrite column ``Row`` of ``inList`` with the values of ``chikdList``.

    Only the first ``len(chikdList)`` rows are written.
    """
    for index, cell in enumerate(chikdList):
        inList[index][Row] = cell


def addLine(inList, childLine):
    """Append ``childLine`` to the matrix ``inList`` as a new last row."""
    inList.append(childLine)


def addRow(inList, childRow):
    """Append ``childRow`` to the matrix ``inList`` as a new last column.

    Raises:
        IndexError: If ``childRow`` has fewer entries than ``inList`` has rows.
    """
    for index, line in enumerate(inList):
        line.append(childRow[index])


def getAVG(inList):
    """Return the mean of the numeric entries of ``inList``.

    Non-numeric entries (header, missing values) are ignored.

    Returns:
        The mean as a float, or the message string ``"当前特征无平均值"``
        when the list contains no numeric entry.
    """
    values = _numericValues(inList)
    if not values:
        return "当前特征无平均值"
    return sum(values) / len(values)


def getAVE(inList):
    """Return the population variance of the numeric entries of ``inList``.

    Non-numeric entries are ignored. ``zScore`` takes the square root of
    this value to obtain the standard deviation, so it must be the variance
    (sum of squared deviations divided by the count).

    Returns:
        The variance as a float, or the message string ``"当前特征无方差"``
        when the list contains no numeric entry.
    """
    values = _numericValues(inList)
    if not values:
        return "当前特征无方差"
    avg = sum(values) / len(values)
    return sum((value - avg) ** 2 for value in values) / len(values)


def average(seq, total=0.0):
    """Return ``total`` plus the sum of ``seq``, divided by ``len(seq)``.

    Works on any iterable, including generators.

    Raises:
        ZeroDivisionError: If ``seq`` is empty.
    """
    num = 0
    for item in seq:
        total += item
        num += 1
    return total / num


def getQUANTILE(inList, inlocaltion):
    """Return a quantile of the numeric entries of ``inList``.

    Args:
        inList: Values to inspect; non-numeric entries are ignored.
        inlocaltion: Quantile position, ``0 <= inlocaltion < 1``.

    Returns:
        For ``0.5`` the median (mean of the two middle values when the count
        is even); otherwise the sorted value at index
        ``int(count * inlocaltion)``. Returns ``"输入的分位数数值错误"`` for an
        out-of-range position and ``"当前特征不可求中位数"`` when there is no
        numeric entry.
    """
    if inlocaltion < 0 or inlocaltion >= 1:
        return "输入的分位数数值错误"
    values = sorted(_numericValues(inList))
    if not values:
        return "当前特征不可求中位数"
    count = len(values)
    if inlocaltion == 0.5:
        middle = count // 2
        if count % 2 == 1:
            return values[middle]
        return (values[middle - 1] + values[middle]) / 2.0
    return values[int(count * inlocaltion)]


def removeNoiseAuto(inList):
    """Blank out outliers of a column using the interquartile range.

    An entry is replaced by ``''`` when it lies more than ``1.5 * IQR``
    above the third quartile or below the first quartile. Entries that are
    not numeric (already missing) are left as they are. The list is modified
    in place and also returned; it is returned unchanged when it holds no
    numeric value at all.
    """
    Q3 = getQUANTILE(inList, 0.75)
    Q1 = getQUANTILE(inList, 0.25)
    # getQUANTILE reports "no data" with a message string, not a number.
    if not isinstance(Q3, float) or not isinstance(Q1, float):
        return inList
    fence = 1.5 * (Q3 - Q1)
    for i in range(1, len(inList)):
        value = _toFloat(inList[i])
        if value is None:
            continue
        if value - Q3 > fence or Q1 - value > fence:
            inList[i] = ''
    return inList


def removeNoiseByThresholdMin(inList, inThresholdMin):
    """Blank out (``''``) every numeric entry below ``inThresholdMin``.

    The header at index 0 and non-numeric entries are left untouched. The
    list is modified in place and also returned.
    """
    for i in range(1, len(inList)):
        value = _toFloat(inList[i])
        if value is not None and value < inThresholdMin:
            inList[i] = ''
    return inList


def removeNoiseByThresholdMax(inList, inThresholdMax):
    """Blank out (``''``) every numeric entry above ``inThresholdMax``.

    The header at index 0 and non-numeric entries are left untouched. The
    list is modified in place and also returned.
    """
    for i in range(1, len(inList)):
        value = _toFloat(inList[i])
        if value is not None and value > inThresholdMax:
            inList[i] = ''
    return inList


def _fillMissing(inList, filler):
    """Replace every ``''`` after the header with ``str(filler)``.

    Nothing is filled when ``filler`` is not a float, i.e. when the
    statistic could not be computed and a message string came back instead.
    """
    if isinstance(filler, float):
        text = str(filler)
        for i in range(1, len(inList)):
            if inList[i] == '':
                inList[i] = text
    return inList


def autoPaddingByAVG(inList):
    """Fill missing values (``''``) of a column with the column mean.

    The list is modified in place and also returned; it is returned
    unchanged when no mean exists.
    """
    return _fillMissing(inList, getAVG(inList))


def autoPaddingByMedian(inList):
    """Fill missing values (``''``) of a column with the column median.

    The list is modified in place and also returned; it is returned
    unchanged when no median exists.
    """
    return _fillMissing(inList, getQUANTILE(inList, 0.5))


def _sortedPositions(inList):
    """Return ``(values, order)`` for the entries after the header.

    ``values`` maps each position to its float value and ``order`` lists the
    positions sorted by ascending value (ties keep ascending position).
    """
    values = dict((i, float(inList[i])) for i in range(1, len(inList)))
    order = sorted(range(1, len(inList)), key=lambda i: values[i])
    return values, order


def _smoothBin(inList, values, members):
    """Overwrite the positions in ``members`` with the mean of their values."""
    mean = average(values[i] for i in members)
    for i in members:
        inList[i] = mean


def binningWidth(inList, width):
    """Discretise a column by width-limited bins, smoothing by bin means.

    The values after the header are sorted; a bin keeps growing while the
    next value is at most ``width`` above the first value of the bin. Every
    entry is then replaced by the mean (a float) of its bin.

    Raises:
        ValueError: If an entry after the header is not numeric.
    """
    values, order = _sortedPositions(inList)
    bins = []
    for i in order:
        if bins and values[i] - values[bins[-1][0]] <= width:
            bins[-1].append(i)
        else:
            bins.append([i])
    for members in bins:
        _smoothBin(inList, values, members)
    return inList


def binningDeep(inList, deep1):
    """Discretise a column by equal-frequency bins, smoothing by bin means.

    The values after the header are sorted and cut into consecutive bins of
    ``deep1 - 1`` elements (the last bin may be shorter); every entry is
    replaced by the mean (a float) of its bin.

    Raises:
        ValueError: If ``deep1 - 1`` is smaller than 1, or if an entry after
            the header is not numeric.
    """
    # NOTE(review): the bin size has always been deep1 - 1, not deep1. This
    # looks like an off-by-one but is kept so existing results do not change.
    deep = deep1 - 1
    if deep < 1:
        raise ValueError("deep1 must be at least 2")
    values, order = _sortedPositions(inList)
    # Slicing covers every bin, including a full or short last one.
    for start in range(0, len(order), deep):
        _smoothBin(inList, values, order[start:start + deep])
    return inList


def oneHot(inList, Row):
    """One-hot encode column ``Row`` of the matrix ``inList`` in place.

    For every distinct value of the column (in order of first appearance) a
    new column is appended: its header cell is the value itself and its data
    cells are ``'1'`` where the row holds that value, else ``'0'``. The
    resulting matrix is printed row by row; nothing is returned.
    """
    if not inList:
        return
    cells = getRow(inList, Row)[1:]
    categories = []
    seen = set()
    for cell in cells:
        if cell not in seen:
            seen.add(cell)
            categories.append(cell)
    for category in categories:
        flags = ['1' if cell == category else '0' for cell in cells]
        addRow(inList, [category] + flags)
    for line in inList:
        print(line)


def minMax(inList):
    """Min-max normalise the numeric entries of a column to ``[0, 1]``.

    Numeric entries after the header are replaced by strings with four
    decimals; other entries are left untouched. A constant column maps to
    ``'0.0000'``. The list is modified in place and also returned.
    """
    parsed = [None] + [_toFloat(cell) for cell in inList[1:]]
    numbers = [value for value in parsed if value is not None]
    if not numbers:
        return inList
    minvalue = min(numbers)
    span = max(numbers) - minvalue
    for i in range(1, len(inList)):
        value = parsed[i]
        if value is None:
            continue
        # A zero span means every value is identical; avoid dividing by 0.
        scaled = (value - minvalue) / span if span else 0.0
        inList[i] = "%.4f" % scaled
    return inList


def zScore(inList):
    """Z-score normalise the numeric entries of a column.

    Numeric entries after the header are replaced by
    ``(value - mean) / standard deviation`` formatted with four decimals;
    other entries are left untouched. A column with zero deviation maps to
    ``'0.0000'``. The input list is printed first (existing behaviour), then
    modified in place and returned.
    """
    print(inList)
    u = getAVG(inList)
    ave = getAVE(inList)
    # Both helpers return a message string when there is nothing numeric.
    if not isinstance(u, float) or not isinstance(ave, float):
        return inList
    stand = math.sqrt(ave)
    for i in range(1, len(inList)):
        value = _toFloat(inList[i])
        if value is None:
            continue
        score = (value - u) / stand if stand else 0.0
        inList[i] = "%.4f" % score
    return inList


def similarityDistance(inList1, inList2, n):
    """Return the Minkowski distance of order ``n`` between two columns.

    The header at index 0 is skipped. ``n == 1`` gives the Manhattan
    distance and ``n == 2`` the Euclidean distance.

    Raises:
        ValueError: If ``n`` is not positive or an entry is not numeric.
        IndexError: If ``inList2`` is shorter than ``inList1``.
    """
    if n <= 0:
        raise ValueError("n must be positive")
    total = 0.0
    for i in range(1, len(inList1)):
        total += abs(float(inList1[i]) - float(inList2[i])) ** n
    # The root must match the exponent; it used to be fixed at 1/2.
    return pow(total, 1.0 / n)


def similaritySim(inList1, inList2):
    """Return the cosine similarity between two columns.

    The header at index 0 is skipped. Returns ``0.0`` when either column has
    zero length (all zeros), for which the cosine is undefined.

    Raises:
        ValueError: If an entry is not numeric.
        IndexError: If ``inList2`` is shorter than ``inList1``.
    """
    dot = 0.0
    for i in range(1, len(inList1)):
        dot += float(inList1[i]) * float(inList2[i])
    norm1 = math.sqrt(sum(float(cell) ** 2 for cell in inList1[1:]))
    norm2 = math.sqrt(sum(float(cell) ** 2 for cell in inList2[1:]))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)


if __name__ == "__main__":
    # Guarded so that importing this module does not read the file.
    fileInput = fileREAD("D:\\PythonWorkSpace\\ExternalFile\\train.csv", "r")

    # Usage examples (uncomment to run):
    #
    # # get one row
    # print(getLine(fileInput, 1))
    #
    # # get one column
    # print(getRow(fileInput, 0))
    #
    # # set one row
    # print("before:")
    # print(getLine(fileInput, 1))
    # setLine(fileInput, getLine(fileInput, 2), 1)
    # print("after:")
    # print(getLine(fileInput, 1))
    #
    # # set one column
    # print("before:")
    # print(getRow(fileInput, 1))
    # setRow(fileInput, getRow(fileInput, 2), 1)
    # print("after:")
    # print(getRow(fileInput, 1))
    #
    # # mean
    # print(getAVG(getRow(fileInput, 9)))
    #
    # # variance
    # print(getAVE(getRow(fileInput, 9)))
    #
    # # quantile
    # print(getQUANTILE(getRow(fileInput, 9), 0.5))
    #
    # # noise filtering 1
    # print(removeNoiseAuto(getRow(fileInput, 1)))
    #
    # # noise filtering 2
    # print(removeNoiseByThresholdMin(getRow(fileInput, 0), 10))
    #
    # # noise filtering 3
    # print(removeNoiseByThresholdMax(getRow(fileInput, 0), 10))
    #
    # # missing-value padding 1
    # print(autoPaddingByAVG(getRow(fileInput, 0)))
    #
    # # missing-value padding 2
    # print(autoPaddingByMedian(getRow(fileInput, 0)))
    #
    # # equal-width binning
    # print(binningWidth(getRow(fileInput, 0), 3))
    #
    # # equal-frequency binning
    # print(binningDeep(getRow(fileInput, 0), 3))
    #
    # # one-hot encoding
    # oneHot(fileInput, 1)
    # for i in fileInput:
    #     print(i)
    #
    # # min-max normalisation
    # print(minMax(getRow(fileInput, 0)))
    #
    # # z-score normalisation
    # print(zScore(getRow(fileInput, 0)))
    #
    # # distance similarity
    # print(similarityDistance(getRow(fileInput, 0), getRow(fileInput, 0), 2))
    #
    # # cosine similarity
    # print(similaritySim(getRow(fileInput, 0), getRow(fileInput, 1)))

1 0
原创粉丝点击