C45的.data和.names文件转化成Arff数据
来源:互联网 发布:淘宝.jar下载 编辑:程序博客网 时间:2024/06/04 19:16
package cn.ac.ict.ics.utils;import lombok.Cleanup;import java.io.BufferedReader;import java.io.BufferedWriter;import java.io.FileReader;import java.io.IOException;import java.nio.charset.Charset;import java.nio.file.Files;import java.nio.file.Paths;import java.util.StringTokenizer;import java.util.Vector;/** * Created by qibaoyuan on 13-8-21. */public class ArffConverter { public void ConvertCommonFile2Arff(String file) throws Exception { String header = file + ".names"; String data = file + ".data"; String arff = file + ".arff"; String[] lines = readArrayOfStringsFromFile(header); boolean ogotClass = false; String classLine = ""; @Cleanup BufferedWriter bw = null; try { bw = Files.newBufferedWriter(Paths.get(arff), Charset.forName("UTF-8")); } catch (IOException e) { e.printStackTrace(); } assert bw != null; System.out.println("@relation '" + header + "_" + data + "'\n"); bw.write("@relation '" + header + "_" + data + "'\n" + "\r\n"); int nnumberOfAttributes = 0; for (int i = 0; i < lines.length; i++) { //skip comments String line = lines[i].trim(); line = line.replaceAll(" ", ""); if (line.startsWith("|") || line.equals("")) { continue; } StringTokenizer stringTokenizer = new StringTokenizer(line, ":"); //Print.dialog("tokens " + stringTokenizer.countTokens()); if (stringTokenizer.countTokens() != 2) { System.err.println("Error parsing line:\n" + line); } String attributeName = stringTokenizer.nextToken(); String values = stringTokenizer.nextToken(); if (i == lines.length - 1) { //class System.out.println(toWekaFormat(attributeName, new String[]{"-1", "0", "1."})); bw.write(toWekaFormat(attributeName, new String[]{"-1", "0", "1."}) + "\r\n"); nnumberOfAttributes++; } else { if (values.endsWith("continuous.")) { System.out.println("@attribute " + attributeName + " numeric"); bw.write("@attribute " + attributeName + " numeric" + "\r\n"); nnumberOfAttributes++; } else { //nominal values System.out.println(toWekaFormat(attributeName, nominalValues(values))); 
bw.write(toWekaFormat(attributeName, nominalValues(values)) + "\r\n"); nnumberOfAttributes++; } } } System.out.println(classLine + "\n\n" + "@data\n"); bw.write(classLine + "\n\n" + "@data\n" + "\r\n"); @Cleanup BufferedReader dataBr = Files.newBufferedReader(Paths.get(data), Charset.forName("UTF-8")); String line = null; int counter = 0; while (null != (line = dataBr.readLine())) { assert null != line; if (line.trim().startsWith("|")) { continue; } int label = Integer.parseInt(line.substring(line.lastIndexOf(",") + 1)); if (label < 0) line = line.substring(0, line.lastIndexOf(",")) + ",-1"; else if (label == 0) line = line.substring(0, line.lastIndexOf(",")) + ",0"; else line = line.substring(0, line.lastIndexOf(",")) + ",1"; if (counter++ % 5000 == 0) System.out.println("processed:" + counter); //System.out.println(formatDataLine(line, nnumberOfAttributes)); bw.write(formatDataLine(line.replace(" ", ""), nnumberOfAttributes) + "\r\n"); } } private static String formatDataLine(String line, int nnumberOfAttributes) { StringTokenizer stringTokenizer = new StringTokenizer(line, ","); int n = stringTokenizer.countTokens(); if (n != nnumberOfAttributes) { System.err.println("# attributes should be " + nnumberOfAttributes + " but it's " + n + " in line " + line); } StringBuffer stringBuffer = new StringBuffer(); for (int i = 0; i < n - 1; i++) { stringBuffer.append(stringTokenizer.nextToken().trim() + ","); } //I had to use the line below for adult.test because someone added a dot, //which is not present in adult.data (the training part) //stringBuffer.append(takeDot(stringTokenizer.nextToken())); stringBuffer.append(stringTokenizer.nextToken().trim()); return stringBuffer.toString(); } private static String takeDot(String last) { last = last.trim(); last = last.substring(0, last.length() - 1); return last; } private static String toWekaFormat(String attributeName, String[] nominalValues) { String out = "@attribute " + attributeName + " {"; for (int i = 0; i < 
nominalValues.length - 1; i++) { out += nominalValues[i].trim() + ","; } out += takeDot(nominalValues[nominalValues.length - 1]) + "}"; return out; } private static String[] nominalValues(String line) { StringTokenizer stringTokenizer = new StringTokenizer(line, ","); int n = stringTokenizer.countTokens(); if (n < 2) { System.err.println("Problem parsing line:\n" + line); } String[] out = new String[n]; for (int i = 0; i < n; i++) { out[i] = stringTokenizer.nextToken(); } return out; } public static String[] readArrayOfStringsFromFile(String fileName) { Vector v = readVectorOfStringsFromFile(fileName); if (v.size() < 1) { return null; } String[] out = new String[v.size()]; for (int i = 0; i < out.length; i++) { out[i] = (String) v.elementAt(i); } return out; } public static Vector readVectorOfStringsFromFile(String filename) { if (filename == null) { System.err.println("Passed a string that is null !"); } Vector vectorOfStrings = new Vector(); try { BufferedReader bufferedReader = new BufferedReader(new FileReader(filename)); String s = null; int max = 90; int i = 0; while ((s = bufferedReader.readLine()) != null) { if (i++ > max) break; if (s.trim().equals("")) { System.err.println("Skipped blank line"); } else { vectorOfStrings.addElement(s); } } bufferedReader.close(); } catch (IOException e) { e.printStackTrace(); System.err.println("Problem reading file " + filename); } return vectorOfStrings; }}
测试用例
package cn.ac.ict.ics.utils;

import org.junit.Test;

/**
 * Smoke test for {@link ArffConverter}.
 *
 * Created by qibaoyuan on 13-8-21.
 */
public class ArffConverterTest {

    /**
     * Runs the conversion for a fixed base path on the local machine. The test
     * makes no assertions; it only fails if the conversion throws.
     */
    @Test
    public void testConvertCommonFile2Arff() throws Exception {
        final String basePath = "/Users/user/corpus/airlines/airline_14col";
        new ArffConverter().ConvertCommonFile2Arff(basePath);
    }
}
- C45的.data和.names文件转化成Arff数据
- weka怎么把csv文件转化成arff文件
- java代码实现将时间序列数据集(UCR)转化为weka能识别的.arff文件
- 如何下载UCI数据集转成arff格式的文件
- 读取和保存arff文件
- weka关联的ARFF文件
- 构造arff格式的文件
- java调用weka,读取csv及arff文件,将csv转化为arff文件
- [MoonML]-决策树C45的计算过程和其中的疑问
- matlab 读arff文件时的问题
- Data Names
- 用Java创建weka需要的Instance对象和arff文件
- weka中的arff文件
- Weka中的ARFF文件
- Matlab读取.arff文件
- 在Java中使用weka:将实例转化为ARFF文件
- java代码实现将无表头.txt文本文件转化为weka能识别.arff文件
- 在多标签分类中,准备mulan开源软件所需要的.arff和.xml数据的方法
- java装饰者模式实现--字符小写输出
- hdu 2841 Visible Trees(容斥定理)
- jstring与const char* 相互转换
- 两个单链表,存在交点,求交点
- JSP自定义标签开发入门
- C45的.data和.names文件转化成Arff数据
- 约瑟夫环问题
- 找出旋转数中的第一个数
- cocos2d-x win32程序移植到Android
- [matlab]mathworks上的cody challenge题解及一些常用函数的总结(2)
- 在Redhat Enterprise Linux 5.3下安装Oracle10g Release2(转载)
- 八皇后
- EBS GL总账过账
- test