Erlang练习:建立文本索引

来源:互联网 发布:美国硕士奖学金知乎 编辑:程序博客网 时间:2024/05/11 19:16

为文本建立索引是文本信息处理中的一项重要任务。给定一个由英文单词构成的文件,为文件中的所有单词建立索引,记录每个单词出现的行号以及在每行中出现的次数,并将索引存入一个文件。

%%% @doc Builds an in-memory word index for a text file.
%%%
%%% Each word found in the file is stored in a named ETS table as a `#data{}'
%%% record whose `line' field is a list of `{LineNumber, Count}' pairs sorted
%%% by line number.
-module(text_index).

%% The original module used export_all; every function and arity it exposed
%% is listed here so existing callers keep working.
-export([createDB/0, closeDB/0, start/1, main/0]).
-export([readAllText/1, readAllText/2]).
-export([processLine/1, processLine/2, splitWords/2, splitWords/3]).
-export([writeDB/2, updateData/2, updateData/4]).

%% Retained from the original file; all calls below are module-qualified.
-import(re, [run/2, replace/4]).

%% One index entry: `word' is the key, `line' is [{LineNumber, Count}].
-record(data, {word, line = []}).

-define(DBNAME, dataDB).
%% A word is a run of ASCII letters and dots, so a trailing full stop stays
%% attached to the word it follows.
-define(PATTERN, "[a-zA-Z\.]+").

%%% ---------------------------------------------------------------------------
%%% Database operations
%%% ---------------------------------------------------------------------------

%% @doc Creates the public, named ETS set table keyed on the `word' field.
createDB() ->
    ets:new(?DBNAME, [public, named_table, set, {keypos, #data.word}]).

%% @doc Deletes the ETS table created by createDB/0.
closeDB() ->
    ets:delete(?DBNAME).

%%% ---------------------------------------------------------------------------
%%% Reading the file
%%% ---------------------------------------------------------------------------

%% @doc Indexes every word of File into the ETS table.
%% Returns `ok' after indexing (or after printing the open-failure message),
%% and `{error, Reason}' when reading the opened file fails.
start(File) ->
    case file:open(File, [read]) of
        {ok, IoDevice} ->
            indexContent(readAllText(IoDevice));
        _ ->
            io:format("Open the file failed!")
    end.

%% Indexes the text returned by readAllText/1, passing a read error through
%% untouched instead of trying to split it into lines.
indexContent({error, _Reason} = Error) ->
    Error;
indexContent(Content) ->
    processLine(splitLines(Content)).

%% Splits text into lines on CRLF, CR or LF. Empty lines are kept so that the
%% position of a line in the result equals its line number in the file.
splitLines(Content) ->
    re:split(Content, "\r\n|\r|\n", [{return, list}]).

%% @doc Reads the whole file behind IoDevice into memory and closes it.
%% Returns the text as a flat string, or `{error, Reason}' if a read fails.
readAllText(IoDevice) ->
    readAllText(IoDevice, []).

%% @doc Same as readAllText/1, with Content holding the text read so far.
readAllText(IoDevice, Content) ->
    case file:read(IoDevice, 1024) of
        {ok, Text} ->
            %% Nest the chunks and flatten once at end of file.
            readAllText(IoDevice, [Content, Text]);
        eof ->
            file:close(IoDevice),
            lists:flatten(Content);
        {error, Reason} ->
            io:format("Read file failed! The reason is:~p~n", [Reason]),
            file:close(IoDevice),
            {error, Reason}
    end.

%%% ---------------------------------------------------------------------------
%%% Extracting words
%%% ---------------------------------------------------------------------------

%% @doc Indexes a list of lines, numbering the first line 1.
processLine(Lines) ->
    processLine(Lines, 1).

%% @doc Indexes a list of lines, numbering the first line LineNumber.
processLine([H | T], LineNumber) ->
    case re:run(H, ?PATTERN, [global]) of
        {match, MatchItem} ->
            %% A global match yields one single-element list per word.
            Words = splitWords(H, lists:flatten(MatchItem)),
            writeDB(Words, LineNumber);
        nomatch ->
            ok
    end,
    processLine(T, LineNumber + 1);
processLine([], _LineNumber) ->
    ok.

%% @doc Cuts the words out of Line given their `{Start, Length}' positions
%% (zero-based offsets as returned by re:run/3).
splitWords(Line, MatchItem) ->
    splitWords(Line, MatchItem, []).

%% @doc Accumulator form of splitWords/2; Words holds the words found so far
%% in reverse order.
splitWords(Line, [{Start, Length} | T], Words) ->
    Word = lists:sublist(Line, Start + 1, Length),
    splitWords(Line, T, [Word | Words]);
splitWords(_Line, [], Words) ->
    lists:reverse(Words).

%%% ---------------------------------------------------------------------------
%%% Writing the index
%%% ---------------------------------------------------------------------------

%% @doc Records one occurrence on line LineNumber for each word in the list.
writeDB([Key | T], LineNumber) ->
    Occurrences =
        case ets:lookup(?DBNAME, Key) of
            [#data{line = Value}] -> updateData(Value, LineNumber);
            [] -> [{LineNumber, 1}]
        end,
    ets:insert(?DBNAME, #data{word = Key, line = Occurrences}),
    writeDB(T, LineNumber);
writeDB([], _) ->
    ok.

%% @doc Adds one occurrence on LineNumber to a `[{Line, Count}]' list and
%% returns the list sorted by line number.
updateData(LineValue, LineNumber) ->
    updateData(LineValue, LineNumber, [], false).

%% @doc Worker for updateData/2. Res collects the entries visited so far and
%% the last argument records whether an entry for LineNumber was incremented.
updateData([{LineNumber, Times} | Tail], LineNumber, Res, false) ->
    updateData(Tail, LineNumber, [{LineNumber, Times + 1} | Res], true);
updateData([], LineNumber, Res, false) ->
    %% No entry for this line yet: add one with a count of 1.
    lists:keysort(1, [{LineNumber, 1} | Res]);
updateData([], _LineNumber, Res, true) ->
    lists:keysort(1, Res);
updateData([H | T], LineNumber, Res, Tmp) ->
    updateData(T, LineNumber, [H | Res], Tmp).

%%% ---------------------------------------------------------------------------
%%% Entry point
%%% ---------------------------------------------------------------------------

%% @doc Indexes the hard-coded sample file, then deletes the table.
main() ->
    FileName = "C:\\Users\\elqstux\\Desktop\\wy.py",
    createDB(),
    start(FileName),
    closeDB().

updateData/2 的简洁版本,利用lists模块的内建函数。

%% @doc Adds one occurrence on LineNumber to a `[{Line, Count}]' list.
%% If the list already has an entry for LineNumber its count is incremented,
%% otherwise a new `{LineNumber, 1}' entry is added. The result is sorted by
%% line number.
updateData(LineValue, LineNumber) ->
    Updated =
        case lists:keyfind(LineNumber, 1, LineValue) of
            {LineNumber, Times} ->
                lists:keyreplace(LineNumber, 1, LineValue, {LineNumber, Times + 1});
            false ->
                [{LineNumber, 1} | LineValue]
        end,
    %% Sort so that a newly added line does not end up in front of earlier ones.
    lists:keysort(1, Updated).


原创粉丝点击