php解析html dom节点树

来源:互联网 发布:网页制作美工 编辑:程序博客网 时间:2024/05/19 13:10

不得不感叹用DOM直接解析HTML节点树的灵活和强大,因为常见的HTML基本元素就那么几种,再配合ID属性或者CLASS属性之类的信息,就能很方便地定位到目标节点。


在解析HTML文件时,完全可以从正则表达式中脱离出来,毕竟HTML文件中存在大量相似的模式,而且基于DOM的代码,其功能看上去也比较显而易见。当然,正则表达式是非常强大的,应用的领域也更广。


代码如下:


<?php
// Silence the warnings libxml raises while loading real-world HTML
// (inline scripts, unknown tags, ...); only fatal and parse errors are shown.
error_reporting(E_ERROR | E_PARSE);

/**
 * Extracts the sections of a dictionary result page into plain PHP arrays.
 *
 * The page is loaded into a DOMDocument, every <div> whose (trimmed) class
 * attribute equals one of the known section class strings is visited, and the
 * text of the list items inside it is appended to the matching result list.
 *
 * Every public get*Meaning() accessor returns the collected list of strings,
 * or null when nothing was collected for that section.
 */
class DomTree
{
    /**
     * Section table: div class => array(target property, tag path, normalise).
     *
     * The tag path is followed with nested getElementsByTagName() calls, e.g.
     * array('ul', 'li', 'a') reads every <a> inside every <li> inside every
     * <ul> of the section div. When "normalise" is true the text is passed
     * through strip_Empty().
     */
    private static $sections = array(
        'layout dual'  => array('en_or_ch',          array('li'),            false),
        'layout en'    => array('en_to_en',          array('li'),            true),
        'layout sort'  => array('example',           array('ol', 'li'),      true),
        'layout patt'  => array('sentences',         array('ol', 'li'),      true),
        'layout coll'  => array('glossary',          array('ul', 'li'),      true),
        'layout auth'  => array('auth',              array('ul', 'li'),      true),
        'layout comn'  => array('use_in_wrong',      array('ol', 'li'),      true),
        'layout nfw'   => array('approximate_words', array('ul', 'li', 'a'), false),
        'layout baike' => array('baike_trans',       array('ul', 'li'),      false),
    );

    // DOM handle
    private $doc = null;
    // Basic meanings
    private $basic_meaning = array();
    // English/Chinese bilingual definitions
    private $en_or_ch = array();
    // English-to-English definitions
    private $en_to_en = array();
    // Example sentences
    private $example = array();
    // Common sentence patterns
    private $sentences = array();
    // Collocations / glossary
    private $glossary = array();
    // Quotations
    private $auth = array();
    // Common usage mistakes
    private $use_in_wrong = array();
    // Near-synonyms
    private $approximate_words = array();
    // Encyclopedia entries
    private $baike_trans = array();

    /**
     * Loads $source and extracts every known section from it.
     *
     * @param string $source An http(s)/ftp URL, the path of an existing file,
     *                       or a string of HTML markup. Any other input
     *                       terminates the script via die().
     */
    public function __construct($source)
    {
        $this->doc = new DomDocument();
        $this->loadSource($source);

        foreach ($this->doc->getElementsByTagName("div") as $div) {
            if (!$div->hasAttribute("class")) {
                continue;
            }
            $class = trim($div->getAttribute("class"));

            if ($class === "basic clearfix") {
                $this->getBasicMeans($div);
                continue;
            }
            if (isset(self::$sections[$class])) {
                list($property, $tag_path, $normalize) = self::$sections[$class];
                // Append rather than assign: a page may contain several divs
                // carrying the same section class.
                $this->{$property} = array_merge(
                    $this->{$property},
                    $this->collectText($div, $tag_path, $normalize)
                );
            }
        }
    }

    /**
     * Fills $this->doc from a URL, a file path or an HTML string.
     *
     * The URL test must come before the generic string test: a URL is itself
     * a string, so checking is_string() first would parse the URL text as
     * markup and never fetch anything.
     */
    private function loadSource($source)
    {
        if (!is_string($source)) {
            die("不支持的资源类型");
        }

        // "https?" so that https:// URLs are recognised as well.
        if (preg_match('#^(https?|ftp)://#i', $source)) {
            $html = file_get_contents($source);
            if ($html === false || $html === '') {
                die("无法获取远程资源");
            }
            $this->doc->loadHTML($html);
            return;
        }

        if (is_file($source)) {
            $this->doc->loadHTMLFile($source);
            return;
        }

        // Anything else is treated as markup. Compare against '' instead of
        // using empty(), which would also reject the one-character string "0".
        if ($source === '') {
            die("传入的字符串不能为空");
        }
        $this->doc->loadHTML($source);
    }

    /**
     * Collects the text of the nodes reached by following $tag_path below
     * $root, in the same order nested getElementsByTagName() loops visit them.
     *
     * @param DOMElement $root      Section element to search in.
     * @param array      $tag_path  Tag names, outermost first.
     * @param bool       $normalize Whether to pass each text through strip_Empty().
     * @return array List of strings.
     */
    private function collectText($root, array $tag_path, $normalize)
    {
        $nodes = array($root);
        foreach ($tag_path as $tag) {
            $matched = array();
            foreach ($nodes as $node) {
                foreach ($node->getElementsByTagName($tag) as $descendant) {
                    $matched[] = $descendant;
                }
            }
            $nodes = $matched;
        }

        $texts = array();
        foreach ($nodes as $node) {
            $texts[] = $normalize ? $this->strip_Empty($node->nodeValue) : $node->nodeValue;
        }
        return $texts;
    }

    /**
     * Basic meanings: the <strong> texts of every <li> that has no "style"
     * attribute (list items carrying a style attribute are skipped).
     */
    private function getBasicMeans($basic_div)
    {
        foreach ($basic_div->getElementsByTagName("li") as $item) {
            if ($item->hasAttribute("style")) {
                continue;
            }
            $this->basic_meaning = array_merge(
                $this->basic_meaning,
                $this->collectText($item, array("strong"), false)
            );
        }
    }

    /**
     * Replaces every run of two or more whitespace characters with a single
     * space. Returns null when $string is not a string.
     */
    private function strip_Empty($string)
    {
        if (!is_string($string)) {
            return null;
        }
        return preg_replace('#\s{2,}#', ' ', $string);
    }

    /**
     * Shared accessor rule: the list itself, or null when it is empty.
     */
    private function listOrNull(array $list)
    {
        return empty($list) ? null : $list;
    }

    // API: basic meanings
    public function getBasicMeaning()
    {
        return $this->listOrNull($this->basic_meaning);
    }

    // API: English/Chinese bilingual definitions
    public function getEnOrChMeaning()
    {
        return $this->listOrNull($this->en_or_ch);
    }

    // API: English-to-English definitions
    public function getEnToEnMeaning()
    {
        return $this->listOrNull($this->en_to_en);
    }

    // API: example sentences
    public function getExampleMeaning()
    {
        return $this->listOrNull($this->example);
    }

    // API: common sentence patterns
    public function getNormalSentenceMeaning()
    {
        return $this->listOrNull($this->sentences);
    }

    // API: collocations / glossary
    public function getGlossaryMeaning()
    {
        return $this->listOrNull($this->glossary);
    }

    // API: quotations
    public function getAuthMeaning()
    {
        return $this->listOrNull($this->auth);
    }

    // API: common usage mistakes
    public function getUseInWrongMeaning()
    {
        return $this->listOrNull($this->use_in_wrong);
    }

    // API: near-synonyms
    public function getApproximateWordsMeaning()
    {
        return $this->listOrNull($this->approximate_words);
    }

    // API: encyclopedia entries
    public function getBaikeMeaning()
    {
        return $this->listOrNull($this->baike_trans);
    }

    /**
     * Returns every section in one associative array. A section for which
     * nothing was collected maps to null.
     */
    public function getAllMeaning()
    {
        return array(
            'basic_meaning'     => $this->getBasicMeaning(),
            'en_or_ch'          => $this->getEnOrChMeaning(),
            'en_to_en'          => $this->getEnToEnMeaning(),
            'example'           => $this->getExampleMeaning(),
            'normal_sentence'   => $this->getNormalSentenceMeaning(),
            'glossary_sentence' => $this->getGlossaryMeaning(),
            'auth_sentence'     => $this->getAuthMeaning(),
            'wrong_use'         => $this->getUseInWrongMeaning(),
            'approximate_words' => $this->getApproximateWordsMeaning(),
            'baike_meaning'     => $this->getBaikeMeaning(),
        );
    }
}

$dom = new DomTree("./com.html");
$trans = $dom->getAllMeaning();
echo "<pre>";
print_r($trans);
?>



结果如下:


Array(    [basic_meaning] => Array        (            [0] => 单词;消息;话语;诺言            [1] => 用词语表达        )    [en_or_ch] => Array        (            [0] => [C] 字,词 the smallest unit of spoken language which has meaning and can stand alone            [1] => [C] (说的)话,话语,言语 anything said; remark or statement            [2] => [S] 消息,信息; 谣言 piece of news; message; rumour            [3] => [S] 口令,号令; 命令 spoken command or signal            [4] => [S] 诺言,保证 a promise            [5] => vt. 用词语表达; 选用 express (sth) in particular words; phrase sth        )    [en_to_en] => Array        (            [0] => a unit of language that native speakers can identify; "words are the blocks from which sentences are made" "he hardly said ten words all morning"             [1] => a brief statement; "he didn't say a word about it"             [2] => information about recent and important events; "they awaited news of the outcome"             [3] => a verbal command for action; "when I give the word, charge!"             [4] => an exchange of views on some topic; "we had a good discussion" "we had a word or two about it"             [5] => a promise; "he gave his word"             [6] => a word is a string of bits stored in computer memory; "large computers use words up to 64 bits long"             [7] => the divine word of God; the second person in the Trinity (incarnate in Jesus)             [8] => a secret word or phrase known only to a restricted group; "he forgot the password"             [9] => the sacred writings of the Christian religions; "he went to carry the Word to the heathen"             [10] => put into words or an expression; "He formulated his concerns to the board of trustees"         )    [example] => Array        (            [0] => Could we have a word before you go to the meeting? 你去开会之前,咱们能私下说句话吗?            [1] => My friend sent word that he was well. 我朋友捎来口信说他很好。        )    [normal_sentence] => Array        (            [0] =>  What does this word mean? 这个词是什么意思? 
            [1] =>  I couldn't look up the spelling of the word, as I hadn't a dictionary at hand. 我没法查这个词的拼写,因为我手边没有词典。             [2] =>  Many English words are derived from Latin. 许多英文单词源于拉丁文。             [3] =>  All the words beside the central idea should be crossed out. 凡偏离中心思想的词语都应通通删掉。             [4] =>  The editor eliminated slang words from the essay. 编辑将俚语从这篇文章中剔除。             [5] =>  These words can't be staled by repetition. 这些词语不会因为经常使用而变成陈词滥调。             [6] =>  He gave me his visiting card, with a few words in pencil. 他把他的名片给我,上面有几个铅笔字。             [7] =>  I don't believe a word of his story. 他说的这件事我一句话都不相信。             [8] =>  At the press conference, the reporters copied down every word spoken by the prime minister. 在新闻发布会上,记者们逐字记下了首相的讲话。             [9] =>  Tell me what happened in your words. 用你自己的话把发生的事告诉我。             [10] =>  Deeds are better than words when people are in need of help. 当别人需要帮助时,行动胜于语言。             [11] =>  I would like a word with you. 我想和你谈谈。             [12] =>  After a word with the colonel he went away . 他和上校简单谈过之后就走了。             [13] =>  There's been no word from her for weeks. 已经有好几个星期没有她的音信了。             [14] =>  Word came that I was needed at home. 有信儿来说家里需要我。             [15] =>  Word has come that meeting will be held on Tuesday. 通知已到,星期二开会。             [16] =>  Word is that the election will be held in June. 有消息说选举将在六月份举行。             [17] =>  Word is that he's left the country. 据说他已经离开这个国家了。             [18] =>  Word got round that he had resigned. 谣传他已辞职。             [19] =>  Stay hidden until I give the word. 我不下令就藏着别动。             [20] =>  Their word is law. 他们的命令必须服从。             [21] =>  He gave the word and they let him in. 他说出了口令,他们让他进去了。             [22] =>  The word now is “freedom”. 现在的口号是“自由”。             [23] =>  I give you my word I'll go. 我向你保证,我会去的。             [24] =>  Stand by your word. 要守信用。             [25] =>  Hear The Word of God . 
听宣讲《圣经》。             [26] =>  Be careful how you word your answer. 回答时要斟酌字句。             [27] =>  She worded the explanation well. 她的解释措辞得体。             [28] =>  The advice wasn't very tactfully worded. 这份通知措辞不太得体。             [29] =>  The suggestion might be worded more politely. 那项建议的措辞可以更婉转些。             [30] =>  This is a carefully worded contract. 这是一份措辞严谨的合同。         )    [glossary_sentence] => Array        (            [0] => address a few words 讲几句话            [1] => await word from sb 等待某人的消息            [2] => break one's words 食言            [3] => breathe a word 走漏消息            [4] => bring word 带来消息            [5] => choose a word 选择词            [6] => coin a word 杜撰一个词            [7] => cook up words 造新词            [8] => cross out a word 划掉一个词            [9] => cut out many words 删掉许多词            [10] => digest a word 消化一个词            [11] => doubt sb's words 怀疑某人的话            [12] => drink in all the words 吸收所有的词语            [13] => eat one's words 收回前言,认错,道歉            [14] => exchange angry words 发生口角            [15] => find words 找出言语(来表达)            [16] => gain the good word of 博得…的赞扬            [17] => get word 得到消息            [18] => get a word 插嘴            [19] => give one's word 保证,允许            [20] => give the word 发出命令            [21] => have words together 争吵            [22] => have words with sb 与某人吵嘴            [23] => have a word with sb 同某人谈一谈            [24] => hunt up a word 查一个词            [25] => keep one's word 信守诺言            [26] => leave word 留言            [27] => leave out a word 省略一个词,丢掉一个词            [28] => look up a word (在词典里)查一个词            [29] => memorize words 记单词            [30] => play on words 玩弄字眼            [31] => pronounce a word 读一个词            [32] => put in words for 为…说几句话            [33] => put the words into sb's mouth 教某人怎么讲            [34] => quote a word 引用一个词            [35] => receive word of 收到…消息            [36] => regret one's words 为说过的话而后悔            [37] => respect one's word 遵守自己许下的诺言        
    [38] => say a word 说句话,进一步,走漏消息            [39] => say a few words 说几句话            [40] => say a good word for sb 为某人说好话            [41] => send sb a word 给某人捎个信儿            [42] => spell a word 拼写一个词            [43] => stress the word 重读那个词            [44] => take back one's word 收回自己的话            [45] => take sb's word for it 相信了某人的话            [46] => understand a word 理解某个词的意思            [47] => use words 用词            [48] => waste one's words 白费口舌            [49] => weigh words 斟酌词句            [50] => write a word 写一个词            [51] => advance word 事先传出的消息            [52] => angry words 气话            [53] => beautiful words 优美的言辞            [54] => big words 大话            [55] => borrowed word 外来词            [56] => broken words 断断续续的话            [57] => burning words 热情洋溢的话            [58] => choice words 精选的词句            [59] => colorful words 丰富的言辞            [60] => cross words 气话            [61] => empty words 空洞的话,无意义的话            [62] => everyday word 日常用语            [63] => farewell words 送别词            [64] => fighting words 容易引起争论的话,挑战性的话            [65] => foreign word 外来词            [66] => hard words 愤怒的话,激烈的话            [67] => heated word 激烈的言词,争吵时使用的话            [68] => high words 愤怒的话,激烈的话            [69] => hollow words 虚假的言语            [70] => honeyed words 甜言蜜语            [71] => hot words 激烈的言词,争吵时使用的话            [72] => household word 家喻户晓的词            [73] => irresponsible words 不负责任的话            [74] => key words 关键的字眼            [75] => last words 临终遗言            [76] => living words 现代语            [77] => meaningful words 意味深长的言语            [78] => meaningless words 无意义的话            [79] => misspelled word 拼错的词            [80] => native word 本国词,本地词            [81] => pleasant words 动听的语言            [82] => regional word 方言            [83] => scientific word 科学用语            [84] => semi-technical words 半科技词            [85] => sharp words 愤怒的话,激烈的话            [86] => simple word 简单的词            [87] => sincere words 真诚的话         
   [88] => small word 小词            [89] => spoken words 口头语            [90] => suggestive words 含蓄的话            [91] => sweet words 甜言蜜语            [92] => tearful parting words 伤感的离别之言            [93] => the latest word 最新消息,最后消息            [94] => uncleanly words 下流话            [95] => unfamiliar word 生词            [96] => unusual word 冷僻词            [97] => warm words 忿怒的话,激烈的话            [98] => written words 书面语            [99] => wrong words 错词            [100] => dictionary word 词典里出现的词            [101] => English words 英语单词            [102] => law word 法律用语            [103] => newspaper word 新闻用语            [104] => slang word 俚语            [105] => at a word 立即,立刻            [106] => in a word 简言之,总之            [107] => in one's own words 用自己的话说            [108] => in other words 换言之            [109] => upon my word 的确,真的            [110] => without a word 一声没吭            [111] => word in heavy type 黑体字            [112] => words in season 时宜的话            [113] => words of comfort 安慰的话            [114] => words of command 命令            [115] => words of complaint 怨言            [116] => the W- of God 圣经            [117] => words of praise 表扬的话            [118] => word of six letters 六个字母的词            [119] => words of thanks 感谢的话            [120] => word the explanation 解释            [121] => word accurately 准确地用言语表达            [122] => word crudely 简单地用词语〔语言〕表达            [123] => word felicitously 恰当地用言语表达            [124] => word intelligibly 清楚地用语言表达            [125] => word positively 明确地用词语表达            [126] => word vaguely 含糊地表达            [127] => word well 措辞得体        )    [auth_sentence] => Array        (            [0] =>  Rome shall perishswrite that word In the blood that she has spilt. 出自:W. Cowper             [1] =>  We have striven..to draw some word from her; but she..answers nothing. 出自:G. P. R. James             [2] =>  To use his own words, he was in a cleft stick. 出自:H. Conway             [3] =>  Actions speak louder than words. 
出自:Proverb             [4] =>  He words me, girls, he words me, that I should not Be noble to myself. 出自:Anthony Cleopatra,Shakespeare         )    [wrong_use] => Array        (            [0] => 我要跟他说句话。 误 I should like to have word with him. 正 I should like to have a word with him.             [1] => 他们听到消息说足球比赛将在今晚电视实况转播。 误 They had a word that the football match would be televised live this evening. 正 They had word that the football match would be televised live this evening. 析 have word是“听到消息〔新闻〕”的意思,“说句话”是have a word。             [2] => 对逐词背课文,我感到厌倦。 误 I was tired of reciting the texts word after word. 正 I was tired of reciting the texts word for word. 析 “一字不变地,逐字(背诵或翻译)”是word for word,不是word after word。             [3] => 我说了什么错话吗? 误 Have I said any wrong words? 正 Have I said anything wrong? 析 误句语法上没有错,但不符合英语习惯。             [4] => 他不遵守诺言。 误 He broke his words. 正 He broke his word. 析 break one's word意为“不遵守诺言”, word在此短语中不用复数形式。             [5] => 我刚得知他到达的消息。 误 I have just received the word of his arrival. 正 I have just received word of his arrival.             [6] => 有消息传来说我们的篮球队赢了这场比赛。 误 The word came that our basketball team had won the match. 正 Word came that our basketball team had won the match. 析 作“消息”“信息”解时, word前不加冠词。             [7] => 他大约是30年前开始当教师的,换句话说,他当教师已经有30年了。 误 He began to work as a teacher some thirty years ago, in another word, he has been a teacher for thirty years. 正 He began to work as a teacher some thirty years ago, in other words, he has been a teacher for thirty years. 析 in other words是固定短语,意为“换句话说”。             [8] => 他带信给我说怀特先生不久将动身去美国。 误 He carried me words that Mr.White would soon leave for America. 正 He carried me word that Mr. White would soon leave for America. 析 word作“消息”“信”解时,是不可数名词,其后不可加s。             [9] => 今晨我们争吵了。 误 We had a word this morning. 正 We had words this morning.             [10] => 他们曾为鸡毛蒜皮的小事同邻居吵过嘴。 误 They had word with their neighbour over some trifles. 
正 They had words with their neighbours over some trifles. 析 表示“同某人发生口角”时,用have words with sb, words用复数形式。             [11] => 他说的大话使我们都感到惊讶。 误 His big word surprised us all. 正 His big words surprised us all.             [12] => 我们绝不收回前言。 误 We should on no account eat our word. 正 We should on no account eat our words. 析 习语big words, eat one's words中, words词尾的s不可省。         )    [approximate_words] => Array        (            [0] => account            [1] => advice            [2] => chat            [3] => communication            [4] => declaration            [5] => edict            [6] => expression            [7] => message            [8] => notice            [9] => order            [10] => password            [11] => promise            [12] => remark            [13] => term            [14] => couch            [15] => explain            [16] => express            [17] => phrase            [18] => put            [19] => say            [20] => write        )    [baike_meaning] => Array        (            [0] => word:Microsoft Word,属于办公软件,人们日常生活都有可能接触到他,对他并不陌生。 简介 wordMicrosoft Word是微软公司的一个文字处理器应用程序。它最初是由Richard Bro…        ))


0 0