erlang踩坑记录之os_mon

来源:互联网 发布:襄阳java招聘信息 编辑:程序博客网 时间:2024/06/05 11:11

1、现象

          最近为系统开发一个监控工具,最先想到的就是应用erlang自带的os_mon,非常实用,它可以监控每个进程内存使用比例等等信息,但是在实际开发过程中,遇到了不少麻烦。

2、分析过程

       启动os_mon,过程非常简单,代码如下:
application:start(os_mon).

       注意,os_mon依赖于sasl,如果没有启动sasl,先启动sasl。执行代码,发现代码执行失败了,错误信息如下:
(game4@192.168.1.130)5> os_mon:start([], []).
** exception exit: shutdown

=CRASH REPORT==== 1-Sep-2017::01:01:29 ===
  crasher:
    initial call: os_mon_sysinfo:init/1
    pid: <0.371.0>
    registered_name: []
    exception exit: {enoent,
                        [{erlang,open_port,
                             [{spawn,
                                  "d:/Program Files (x86)/erl5.9.1/lib/os_mon-2.2.9/priv/bin/win32sysinfo.exe"},
                              [{packet,1}]],
                             []},
                         {os_mon,open_port,2,[{file,"os_mon.erl"},{line,88}]},
                         {os_mon_sysinfo,start_portprogram,0,
                             [{file,"os_mon_sysinfo.erl"},{line,111}]},
                         {os_mon_sysinfo,init,1,
                             [{file,"os_mon_sysinfo.erl"},{line,60}]},
                         {gen_server,init_it,6,
                             [{file,"gen_server.erl"},{line,304}]},
                         {proc_lib,init_p_do_apply,3,
                             [{file,"proc_lib.erl"},{line,227}]}]}
      in function  gen_server:init_it/6 (gen_server.erl, line 328)
    ancestors: [os_mon_sup,<0.51.0>]
    messages: []
    links: [<0.370.0>]
    dictionary: []
    trap_exit: true
    status: running
    heap_size: 987
    stack_size: 24
    reductions: 550
  neighbours:
(game4@192.168.1.130)6> 

 很长一串错误,看错误位置,应该是erlang执行外部程序win32sysinfo.exe时报错,很奇怪的错误,从头开始追踪,首先是os_mon.erl
%% Supervisor init callback for os_mon.
%%
%% Returns {ok, {SupFlags, ChildSpecs}}. The restart strategy is always
%% one_for_one over a 3600 second window; Windows is allowed 5 restarts in
%% that window, every other OS 4. A child spec is contributed for each
%% service for which startp/1 (defined elsewhere in this module, presumably
%% reading the application configuration) returns true.
init([]) ->
    MaxRestarts =
        case os:type() of
            {win32, _} -> 5;
            _ -> 4
        end,
    SupFlags = {one_for_one, MaxRestarts, 3600},
    %% The order of this list is the order in which the children are started.
    Services = [sysinfo, disksup, memsup, cpu_sup, os_sup],
    ChildSpecs =
        lists:append([childspec(Service, startp(Service)) || Service <- Services]),
    {ok, {SupFlags, ChildSpecs}}.

%% childspec(Service, Enabled) -> [ChildSpec]
%%
%% Returns a one-element list holding the child specification of Service
%% when Enabled is true, and the empty list when it is false, so that the
%% results can simply be concatenated by init/1.
childspec(_Service, false) ->
    [];
childspec(Service, true) when Service =:= cpu_sup;
                              Service =:= disksup;
                              Service =:= memsup ->
    %% These three workers share the same shape: the service name is also
    %% the child id and the callback module.
    [{Service, {Service, start_link, []},
      permanent, 2000, worker, [Service]}];
childspec(os_sup, true) ->
    OsType = os:type(),
    CallbackMod =
        case OsType of
            {win32, _} -> nteventlog; % windows
            _ -> os_sup               % solaris
        end,
    %% os_sup gets the OS type as start argument and a longer shutdown
    %% timeout (10000 ms) than the other workers.
    [{os_sup, {os_sup, start_link, [OsType]},
      permanent, 10000, worker, [CallbackMod]}];
childspec(sysinfo, true) ->
    %% The sysinfo service is implemented by the os_mon_sysinfo module.
    [{os_mon_sysinfo, {os_mon_sysinfo, start_link, []},
      permanent, 2000, worker, [os_mon_sysinfo]}].

从以上代码可以看出,os_mon在init/1中启动时,会依次检查sysinfo、disksup、memsup、cpu_sup、os_sup等服务是否满足启动条件,满足则启动。从前面的报错信息可以看出,应该是在sysinfo这里报错了,所以继续追查sysinfo,对应的代码模块是os_mon_sysinfo.erl,如下:
%% gen_server init callback of os_mon_sysinfo.
%%
%% Traps exits, lowers the process priority and, on Windows, starts the
%% external port program and stores its port in the server state. On any
%% other OS the process exits with {unsupported_os, OS}.
%% (#state{} and ?OK are defined elsewhere in os_mon_sysinfo.erl.)
init([]) ->
    process_flag(trap_exit, true),
    process_flag(priority, low),
    case os:type() of
        {win32, _OSname} ->
            {ok, #state{port = start_portprogram()}};
        UnsupportedOS ->
            exit({unsupported_os, UnsupportedOS})
    end.

%% Opens the win32sysinfo.exe port program and waits up to 5 seconds for
%% its first message.
%%
%% Returns the port when the program answers with the single byte ?OK.
%% Exits with {port_error, Data} on any other data, {port_died, Reason}
%% if the port terminates first, and {port_error, timeout} if nothing
%% arrives in time.
start_portprogram() ->
    SysinfoPort = os_mon:open_port("win32sysinfo.exe", [{packet,1}]),
    Outcome =
        receive
            {SysinfoPort, {data, [?OK]}} ->
                ready;
            {SysinfoPort, {data, Unexpected}} ->
                {port_error, Unexpected};
            {'EXIT', SysinfoPort, Why} ->
                {port_died, Why}
        after 5000 ->
            {port_error, timeout}
        end,
    case Outcome of
        ready -> SysinfoPort;
        Failure -> exit(Failure)
    end.

从上面的代码可以看出,sysinfo启动时会调用os_mon:open_port("win32sysinfo.exe", [{packet,1}])来启动一个port程序,那么问题应该就出在open_port这部分代码上,如下:
%% open_port(Name, Opts) -> port()
%%
%% Starts the os_mon port program Name and returns the port.
%%
%%   Name - file name of the port program, e.g. "win32sysinfo.exe". It is
%%          treated as a plain file name, not as a command line, so it must
%%          not carry arguments.
%%   Opts - option list handed unchanged to erlang:open_port/2.
%%
%% The program is looked up in os_mon*/priv/bin/Name first and, if no
%% regular file exists there, in os_mon*/priv/bin/Arch/Name.
%%
%% NOTE: os_mon 2.2.9 as shipped passes {spawn, Path} here. With spawn the
%% first space separated token of the command is taken as the executable,
%% so an installation directory containing a space (for instance
%% "d:/Program Files (x86)/erl5.9.1") makes the call fail with enoent.
%% {spawn_executable, Path} takes the path verbatim and is the form the
%% erlang:open_port/2 documentation recommends for such paths.
open_port(Name, Opts) ->
    PrivDir = code:priv_dir(os_mon),
    ReleasedPath = filename:join([PrivDir, "bin", Name]),
    %% Check os_mon*/priv/bin/Name
    Path =
        case filelib:is_regular(ReleasedPath) of
            true ->
                ReleasedPath;
            false ->
                %% Use os_mon*/priv/bin/Arch/Name
                filename:join(
                  [PrivDir, "bin",
                   erlang:system_info(system_architecture), Name])
        end,
    erlang:open_port({spawn_executable, Path}, Opts).

很显然,当需要启动sysinfo的时候,erlang会根据os_mon的路径(代码 code:priv_dir(os_mon)),也就是erlang的安装路径,查找并执行win32sysinfo.exe,但是如果erlang安装路径中存在空格,那么这个执行的命令就会被操作系统分割成命令+参数的形式,而这种形式肯定是不可能执行成功的。erlang:open_port/2文档描述如下:
{spawn, Command}

Starts an external program. Command is the name of the external program which will be run. Command runs outside the Erlang work space unless an Erlang driver with the name Command is found. If found, that driver will be started. A driver runs in the Erlang workspace, which means that it is linked with the Erlang runtime system.

When starting external programs on Solaris, the system call vfork is used in preference to fork for performance reasons, although it has a history of being less robust. If there are problems with using vfork, setting the environment variable ERL_NO_VFORK to any value will cause fork to be used instead.

For external programs, the PATH is searched (or an equivalent method is used to find programs, depending on operating system). This is done by invoking the shell on certain platforms. The first space separated token of the command will be considered as the name of the executable (or driver). This (among other things) makes this option unsuitable for running programs having spaces in file or directory names. Use {spawn_executable, Command} instead if spaces in executable file names is desired.

3、解决办法

       分析了问题是怎么产生的,解决办法也就有了,按照文档描述,可以使用{spawn_executable, Command}的方式替代,如果不想修改erlang的源码,那么就换安装路径吧。

4、总结

      很早之前就听说过编程工具不要安装在带空格或中文的路径下,一开始不是很理解,果然实践出真知。