将 C4.5 的 .data 和 .names 文件转换为 ARFF 数据

来源:互联网 发布:淘宝.jar下载 编辑:程序博客网 时间:2024/06/04 19:16
package cn.ac.ict.ics.utils;

import lombok.Cleanup;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import java.util.Vector;

/**
 * Converts a C4.5-style data set ({@code <base>.names} + {@code <base>.data}) into a
 * Weka ARFF file ({@code <base>.arff}).
 *
 * <p>The {@code .names} file is expected to contain one {@code name: values} definition per
 * line; lines starting with {@code |} are comments. The <em>last</em> definition is treated
 * as the class attribute and is always declared as {@code {-1,0,1}}: the last column of every
 * data row is parsed as an integer and replaced by its sign.
 *
 * <p>Created by qibaoyuan on 13-8-21.
 */
public class ArffConverter {

    /** Charset used for the {@code .data} input and the {@code .arff} output. */
    private static final Charset UTF_8 = StandardCharsets.UTF_8;

    /** Terminator appended after every record written to the ARFF file. */
    private static final String EOL = "\r\n";

    /** Values declared for the class attribute (sign of the original label). */
    private static final String[] CLASS_VALUES = {"-1", "0", "1"};

    /**
     * Number of physical lines the public reader helpers look at before they stop.
     * Kept only so that their historical behaviour does not change; the converter
     * itself reads the complete header.
     */
    private static final int LEGACY_MAX_LINES = 91;

    /** A progress message is printed once per this many data rows. */
    private static final int PROGRESS_INTERVAL = 5000;

    /**
     * Reads {@code file + ".names"} and {@code file + ".data"} and writes {@code file + ".arff"}.
     * The generated header lines are also echoed to standard output.
     *
     * @param file path of the data set without extension
     * @throws IOException              if the header is empty/unreadable, or the data or ARFF
     *                                  file cannot be opened, read or written
     * @throws IllegalArgumentException if a header line is not of the form {@code name: values}
     * @throws NumberFormatException    if the last column of a data row is not an integer
     */
    public void ConvertCommonFile2Arff(String file) throws Exception {
        String header = file + ".names";
        String data = file + ".data";
        String arff = file + ".arff";

        // Validate the header before any output file is created.
        List<String> definitions = readAttributeDefinitions(header);
        if (definitions.isEmpty()) {
            throw new IOException("No attribute definitions found in " + header);
        }

        // Open the input first so that a missing .data file does not leave a stray .arff behind.
        @Cleanup
        BufferedReader dataBr = Files.newBufferedReader(Paths.get(data), UTF_8);
        @Cleanup
        BufferedWriter bw = Files.newBufferedWriter(Paths.get(arff), UTF_8);

        emit(bw, "@relation '" + header + "_" + data + "'\n");
        int attributeCount = writeAttributes(bw, definitions);
        emit(bw, "\n\n" + "@data\n");
        writeData(dataBr, bw, attributeCount);
    }

    /** Returns the non-blank, non-comment lines of the header with all spaces removed. */
    private static List<String> readAttributeDefinitions(String header) throws IOException {
        List<String> rawLines = new ArrayList<>();
        appendNonBlankLines(header, Integer.MAX_VALUE, rawLines);

        List<String> definitions = new ArrayList<>();
        for (String rawLine : rawLines) {
            String line = rawLine.trim().replace(" ", "");
            if (line.isEmpty() || line.startsWith("|")) {
                continue; // comment or effectively empty
            }
            definitions.add(line);
        }
        return definitions;
    }

    /** Writes one {@code @attribute} declaration per definition and returns how many were written. */
    private static int writeAttributes(BufferedWriter bw, List<String> definitions) throws IOException {
        int written = 0;
        int classIndex = definitions.size() - 1; // the last real definition is the class
        for (int i = 0; i <= classIndex; i++) {
            String line = definitions.get(i);
            StringTokenizer tokens = new StringTokenizer(line, ":");
            int tokenCount = tokens.countTokens();
            if (tokenCount != 2) {
                System.err.println("Error parsing line:\n" + line);
            }
            if (tokenCount < 2) {
                // Nothing sensible can be emitted without both a name and a value list.
                throw new IllegalArgumentException("Expected '<name>: <values>' but found: " + line);
            }
            String attributeName = tokens.nextToken();
            String values = tokens.nextToken();

            String declaration;
            if (i == classIndex) {
                declaration = toWekaFormat(attributeName, CLASS_VALUES);
            } else if (takeDot(values).endsWith("continuous")) {
                declaration = "@attribute " + attributeName + " numeric";
            } else {
                declaration = toWekaFormat(attributeName, nominalValues(values));
            }
            emit(bw, declaration);
            written++;
        }
        return written;
    }

    /** Copies the data rows to the ARFF file, replacing the last column by its sign. */
    private static void writeData(BufferedReader dataBr, BufferedWriter bw, int attributeCount)
            throws IOException {
        String line;
        int counter = 0;
        while ((line = dataBr.readLine()) != null) {
            String row = line.trim();
            if (row.isEmpty() || row.startsWith("|")) {
                continue; // blank line or comment
            }
            int comma = row.lastIndexOf(',');
            // Tolerate surrounding whitespace and an optional terminating dot on the label.
            int label = Integer.parseInt(takeDot(row.substring(comma + 1)));
            // comma == -1 (single-column row) yields an empty prefix.
            String normalized = row.substring(0, comma + 1) + Integer.signum(label);

            if (counter++ % PROGRESS_INTERVAL == 0) {
                System.out.println("processed:" + counter);
            }
            bw.write(formatDataLine(normalized.replace(" ", ""), attributeCount) + EOL);
        }
    }

    /** Echoes {@code text} to standard output and writes it, followed by {@link #EOL}, to the file. */
    private static void emit(BufferedWriter bw, String text) throws IOException {
        System.out.println(text);
        bw.write(text + EOL);
    }

    /**
     * Re-joins the comma separated fields of {@code line} with each field trimmed, and warns on
     * standard error when the field count differs from {@code nnumberOfAttributes}.
     */
    private static String formatDataLine(String line, int nnumberOfAttributes) {
        StringTokenizer tokens = new StringTokenizer(line, ",");
        int n = tokens.countTokens();
        if (n != nnumberOfAttributes) {
            System.err.println("# attributes should be " + nnumberOfAttributes
                    + " but it's " + n + " in line " + line);
        }
        StringBuilder joined = new StringBuilder();
        boolean first = true;
        while (tokens.hasMoreTokens()) {
            if (!first) {
                joined.append(',');
            }
            joined.append(tokens.nextToken().trim());
            first = false;
        }
        return joined.toString();
    }

    /** Trims {@code last} and removes a single trailing dot, if there is one. */
    private static String takeDot(String last) {
        String trimmed = last.trim();
        if (trimmed.endsWith(".")) {
            return trimmed.substring(0, trimmed.length() - 1);
        }
        return trimmed;
    }

    /** Builds a nominal declaration: {@code @attribute <name> {v1,v2,...}}. */
    private static String toWekaFormat(String attributeName, String[] nominalValues) {
        StringBuilder out = new StringBuilder("@attribute ").append(attributeName).append(" {");
        int last = nominalValues.length - 1;
        for (int i = 0; i < last; i++) {
            out.append(nominalValues[i].trim()).append(',');
        }
        if (last >= 0) {
            // Only the final value can carry the terminating dot of the definition.
            out.append(takeDot(nominalValues[last]));
        }
        return out.append('}').toString();
    }

    /** Splits a comma separated value list; warns when fewer than two values are present. */
    private static String[] nominalValues(String line) {
        StringTokenizer tokens = new StringTokenizer(line, ",");
        int n = tokens.countTokens();
        if (n < 2) {
            System.err.println("Problem parsing line:\n" + line);
        }
        String[] out = new String[n];
        for (int i = 0; i < n; i++) {
            out[i] = tokens.nextToken();
        }
        return out;
    }

    /**
     * Returns the non-blank lines of a file as an array.
     *
     * @param fileName file to read (platform default charset)
     * @return the lines, or {@code null} when no non-blank line could be read
     *         (kept for backward compatibility)
     * @see #readVectorOfStringsFromFile(String)
     */
    public static String[] readArrayOfStringsFromFile(String fileName) {
        Vector<String> lines = readCappedLines(fileName);
        if (lines.isEmpty()) {
            return null;
        }
        return lines.toArray(new String[0]);
    }

    /**
     * Returns the non-blank lines among the first {@value #LEGACY_MAX_LINES} lines of a file.
     * I/O problems are reported on standard error and whatever was read so far is returned;
     * a {@code null} file name yields an empty vector.
     *
     * @param filename file to read (platform default charset)
     * @return a vector of {@link String}; never {@code null}
     */
    @SuppressWarnings("rawtypes") // raw return type is part of the published signature
    public static Vector readVectorOfStringsFromFile(String filename) {
        return readCappedLines(filename);
    }

    /** Shared implementation of the two public readers; never throws on I/O failure. */
    private static Vector<String> readCappedLines(String filename) {
        Vector<String> lines = new Vector<>();
        if (filename == null) {
            System.err.println("Passed a string that is null !");
            return lines;
        }
        try {
            appendNonBlankLines(filename, LEGACY_MAX_LINES, lines);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("Problem reading file " + filename);
        }
        return lines;
    }

    /**
     * Appends the non-blank lines found among the first {@code maxLines} lines of
     * {@code filename} to {@code sink}. Lines already appended stay in {@code sink}
     * when an {@link IOException} is thrown.
     */
    private static void appendNonBlankLines(String filename, int maxLines, List<String> sink)
            throws IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(filename))) {
            String s;
            int seen = 0;
            while ((s = reader.readLine()) != null) {
                if (seen++ >= maxLines) {
                    break;
                }
                if (s.trim().equals("")) {
                    System.err.println("Skipped blank line");
                } else {
                    sink.add(s);
                }
            }
        }
    }
}

测试用例

package cn.ac.ict.ics.utils;

import org.junit.Test;

/**
 * Smoke test for {@link ArffConverter}: runs a conversion against a data set stored at a
 * fixed local path and passes as long as no exception is thrown.
 *
 * <p>Created by qibaoyuan on 13-8-21.
 */
public class ArffConverterTest {

    /** Converts the data set whose base path (without extension) is hard-coded below. */
    @Test
    public void testConvertCommonFile2Arff() throws Exception {
        // Base path only; the converter derives the file names it needs from it.
        final String basePath = "/Users/user/corpus/" + "airlines/airline_14col";
        new ArffConverter().ConvertCommonFile2Arff(basePath);
    }
}


原创粉丝点击