使用golang读取StarDict 词典idx文件并按行输出

来源:互联网 发布:盛世赢家炒股软件 编辑:程序博客网 时间:2024/06/14 23:39

StarDict 词典idx文件格式:


单词列表中的每一个条目依次包含以下三个域:

word_str;                 // a utf-8 string terminated by '\0'.

                                  // 一个 utf-8 编码的字符串,以 '\0' 终止符结束。word_str 的长度应小于 256 字节

word_data_offset; // word data's offset in .dict file

                                  // 单词数据在 .dict 文件中的偏移,

                                  //If the version is "3.0.0" and "idxoffsetbits=64", 

                                  //word_data_offset will be 64-bits unsigned number in network byte order. 


word_data_size;   // word data's total size in .dict file

                                  // 单词数据在 .dict 文件中的总大小,word_data_size should be 32-bits unsigned number

                                  // in network byte order.


// Command idxdump converts a StarDict .idx file into a text listing with one
// entry per line, formatted as: word<TAB>offset<TAB>size<LF>.
//
// Each .idx entry consists of three consecutive fields:
//
//	word_str          NUL-terminated UTF-8 string
//	word_data_offset  unsigned big-endian integer, 32 bits wide (64 bits when
//	                  the dictionary declares idxoffsetbits=64)
//	word_data_size    unsigned big-endian 32-bit integer
package main

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"os"
	"strconv"
)

const (
	inputPath  = "gaojihanyudacidian_fix.idx"
	outputPath = "output.txt"

	// idxOffsetWidth is the byte width of word_data_offset in the input
	// file. Change it to 8 for dictionaries that use idxoffsetbits=64.
	idxOffsetWidth = 4

	// sizeWidth is the byte width of word_data_size; it is always 32 bits.
	sizeWidth = 4
)

// errTruncated reports that the data ended in the middle of an entry.
var errTruncated = errors.New("truncated idx entry")

// idxEntry is one decoded record of a StarDict .idx file.
type idxEntry struct {
	word   []byte // word_str without its NUL terminator; aliases the input buffer
	offset uint64 // word_data_offset
	size   uint32 // word_data_size
}

// nextEntry decodes the entry at the start of data and returns it together
// with the bytes that follow it. offsetWidth is the width of the offset field
// in bytes and must be 4 or 8. The word is taken verbatim up to the first NUL
// byte, so any byte sequence (including 4-byte UTF-8 characters) is accepted.
// An error wrapping errTruncated is returned when data ends before the entry
// is complete.
func nextEntry(data []byte, offsetWidth int) (idxEntry, []byte, error) {
	if offsetWidth != 4 && offsetWidth != 8 {
		return idxEntry{}, nil, fmt.Errorf("unsupported offset width %d", offsetWidth)
	}

	end := bytes.IndexByte(data, 0)
	if end < 0 {
		return idxEntry{}, nil, fmt.Errorf("%w: word has no NUL terminator", errTruncated)
	}

	fields := data[end+1:]
	if len(fields) < offsetWidth+sizeWidth {
		return idxEntry{}, nil, fmt.Errorf("%w: offset/size fields are incomplete", errTruncated)
	}

	entry := idxEntry{word: data[:end]}
	if offsetWidth == 8 {
		entry.offset = binary.BigEndian.Uint64(fields)
	} else {
		entry.offset = uint64(binary.BigEndian.Uint32(fields))
	}
	entry.size = binary.BigEndian.Uint32(fields[offsetWidth:])

	return entry, fields[offsetWidth+sizeWidth:], nil
}

// appendLine appends the text form of e to dst and returns the extended
// slice. The line is the word, a tab, the decimal offset, a tab, the decimal
// size and a line feed.
func appendLine(dst []byte, e idxEntry) []byte {
	dst = append(dst, e.word...)
	dst = append(dst, '\t')
	dst = strconv.AppendUint(dst, e.offset, 10)
	dst = append(dst, '\t')
	dst = strconv.AppendUint(dst, uint64(e.size), 10)
	return append(dst, '\n')
}

// run reads inputPath, decodes every entry and writes the listing to
// outputPath. Entries decoded before a failure are still flushed to the
// output file.
func run() (err error) {
	data, err := os.ReadFile(inputPath)
	if err != nil {
		return err
	}

	out, err := os.Create(outputPath)
	if err != nil {
		return err
	}
	defer func() {
		// A failed Close on a written file can mean lost data, so report it
		// unless an earlier error is already being returned.
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	w := bufio.NewWriter(out)
	defer func() {
		// Deferred calls run last-in first-out: this flush happens before
		// the Close above.
		if ferr := w.Flush(); err == nil {
			err = ferr
		}
	}()

	total := len(data)
	var line []byte
	for len(data) > 0 {
		entry, rest, perr := nextEntry(data, idxOffsetWidth)
		if perr != nil {
			return fmt.Errorf("%s: entry at byte %d: %w", inputPath, total-len(data), perr)
		}
		line = appendLine(line[:0], entry)
		if _, werr := w.Write(line); werr != nil {
			return werr
		}
		data = rest
	}
	return nil
}

func main() {
	if err := run(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Printf("\nfinish read\n")
}


原创粉丝点击