C语言解释HTML文档
来源:互联网 发布:c语言大于等于并小于 编辑:程序博客网 时间:2024/06/02 00:07
在C语言编程中,偶尔需要访问HTML文档。HTML是结构化文档,如果把它当作普通字符串、用查找的方式去访问,往往满足不了需求。为解决这个问题,我写下了这段代码。代码在VS 2005中调试通过。
/*****************************************************************
gohtml.h
作者:陈海龙
邮箱:bendows@gmail.com
网址:http://www.goserver.org http://hi.csdn.net/bendows
日期:2009年11月14日
*****************************************************************/
#ifndef GOHTML_H
#define GOHTML_H
/*
 * Numeric identifiers for the tag names the parser knows about.
 * Values are implicit, start at 0 and follow declaration order, so the
 * order of the enumerators must not change. 0 (DONTDEL) is also the value
 * html_createTag() stores in a fresh tag's tagtype.
 */
enum TAG_ENUM
{
    /* special records */
    DONTDEL,XMLHEAD,REMARK,DOCTYPE,
    /* A */
    A,ACRONYM,ADDRESS,APPLET,AREA,ATTRIBUTE,
    /* B */
    B,BASE,BASEFONT,BDO,BGSOUND,BIG,BLOCKQUOTE,BODY,BR,BUTTON,
    /* C */
    CAPTION,CENTER,CITE,CLIENTINFORMATION,CLIPBOARDDATA,CODE,COL,COLGROUP,
    COMMENT,CURRENTSTYLE,CUSTOM,
    /* D */
    DATATRANSFER,DD,DEFAULTS,DEL,DFN,DIR,DIV,DL,DOCUMENT,DT,
    /* E */
    EM,EMBED,htmlEVENT,EXTERNAL,
    /* F */
    FIELDSET,FONT,FORM,FRAME,FRAMESET,
    /* H */
    HEAD,HISTORY,HN,HR,HTML,
    /* I */
    I,IFRAME,IMG,IMPLEMENTATION,IMPORT,INPUT,INS,ISINDEX,
    /* K */
    KBD,
    /* L */
    LABEL,LEGEND,LI,LINK,LISTING,LOCATION,
    /* M */
    MAP,MARQUEE,MENU,META,
    /* N */
    NAMESPACE,NAVIGATOR,NEXTID,NOBR,NOFRAMES,NOSCRIPT,
    /* O */
    OBJECT,OL,OPTGROUP,OPTION,
    /* P */
    P,PAGE,PARAM,PLAINTEXT,POPUP,PRE,
    /* Q */
    Q,
    /* R */
    RT,RUBY,RULE,RUNTIMESTYLE,
    /* S */
    S,SAMP,SCREEN,SCRIPT,SELECT,SELECTION,SMALL,SPAN,STRIKE,STRONG,STYLE,
    STYLESHEET,SUB,SUP,
    /* T */
    TABLE,TBODY,TD,TEXTAREA,TEXTNODE,TEXTRANGE,TEXTRECTANGLE,TFOOT,TH,THEAD,
    TITLE,TR,TT,
    /* U */
    U,UL,USERPROFILE,
    /* V */
    VAR,
    /* W */
    WBR,WINDOW,
    /* X */
    XML,XMP
};
/*
 * One row of a tag lookup table.
 * A row whose tagname is NULL terminates the table.
 */
typedef struct TAG_TYPE
{
    char *tagname;  /* upper-case tag name to match */
    int type;       /* TAG_ENUM value stored into html_tag.tagtype on a match */
    int flag;       /* stored into html_tag.flag on a match; html_getnode() treats 0 as "not a container" */
} tag_type;
//Attribute of a tag
/*
 * One attribute of a tag, kept in a singly linked list that hangs off
 * html_tag.pro. key and value point into the page buffer; they are not
 * separately allocated.
 */
typedef struct HTML_PRO
{
    char *key;              /* attribute name */
    char *value;            /* attribute value, NULL when none was parsed */
    struct HTML_PRO *next;  /* next attribute of the same tag, or NULL */
} html_pro;
//Tag record
/*
 * One token produced by html_gettag(): an opening tag, a closing tag, or a
 * run of text between tags. Records form a singly linked list in document
 * order. The string members point into the page buffer.
 */
typedef struct HTML_TAG
{
    char *tag;              /* start of the token text inside the page buffer */
    char *tagname;          /* tag name; left NULL for text records */
    int tagtype;            /* TAG_ENUM value, 0 when the name was not found in the tables */
    int isend;              /* 1 for a closing tag such as </head>, otherwise 0 */
    int flag;               /* flag column copied from the matching tag_type row */
    struct HTML_PRO *pro;   /* attribute list, or NULL */
    struct HTML_TAG *next;  /* next record in document order, or NULL */
} html_tag;
//Child node collection
/*
 * Holder for the children of a node (or for the top-level nodes of a page).
 */
typedef struct HTML_CHILDNODES
{
    int length;               /* number of nodes in the list */
    struct HTML_NODE *nodes;  /* first child; siblings are chained through html_node.next */
    struct HTML_NODE *last;   /* initialised to NULL by html_createChildNodes() */
} html_childNodes;
//Node
/*
 * One element (or text run) of the document tree built by html_getnode().
 */
typedef struct HTML_NODE
{
    html_tag *tag;                       /* opening tag record this node was built from */
    char *tagName;                       /* copy of tag->tagname (same pointer) */
    char *name;                          /* initialised to NULL by html_createNode() */
    char *id;                            /* initialised to NULL by html_createNode() */
    char *innerHTML;                     /* not assigned by the code in this file */
    struct HTML_CHILDNODES *childNodes;  /* children of this node */
    struct HTML_NODE *parentNode;        /* initialised to NULL by html_createNode() */
    struct HTML_NODE *next;              /* next sibling, or NULL */
} html_node;
//Page object
/*
 * State of one parse: the input buffer, the flat tag list and the node tree,
 * plus function pointers that gohtml_init() binds to the default
 * implementations.
 *
 * parserHTML takes a pointer to the page itself. It is declared with
 * struct HTML_PAGE* (not void*) so that the default parserHTML(html_page*)
 * can be assigned to it without an incompatible-pointer conversion.
 */
typedef struct HTML_PAGE
{
    char* pagecode;                               /* writable, NUL-terminated HTML text; modified in place while parsing */
    html_childNodes *childNodes;                  /* top-level nodes, set by parserHTML */
    html_tag* tags;                               /* head of the tag list produced by html_gettag */
    html_tag* curtag;                             /* read cursor into the tag list used while building nodes */
    void (*parserHTML)(struct HTML_PAGE* page);   /* parses pagecode into tags and childNodes */
    html_tag* (*html_createTag)();                /* tag allocator */
    html_pro* (*html_createPro)();                /* attribute allocator */
    html_childNodes* (*html_createChildNodes)();  /* child collection allocator */
    html_node* (*html_createNode)();              /* node allocator */
}html_page;
//Node path
/*
 * One step of a path through the node tree: the zero-based position of a
 * node among its siblings. Steps are chained from the outermost level
 * inwards.
 */
typedef struct PATH_LINK
{
    int site;                /* sibling index at this level */
    struct PATH_LINK *next;  /* step for the next deeper level, or NULL at the target */
} path_link;
/* Binds the default function pointers of *page and clears childNodes, tags and curtag. */
void gohtml_init(html_page* page);
/* Tokenizes page->pagecode and builds page->childNodes from the resulting tags. */
void parserHTML(html_page* page);
/* Allocates an empty child collection (length 0, no nodes). */
html_childNodes* html_createChildNodes();
/* Allocates a tag record with its fields cleared. */
html_tag* html_createTag();
/* Allocates an attribute record with key, value and next set to NULL. */
html_pro* html_createPro();
/* Allocates a node together with an empty child collection. */
html_node* html_createNode();
/* Frees a whole tag list, including the attribute list of every tag. */
void html_freeTag(html_tag* tag);
/* Frees a whole attribute list. */
void html_freePro(html_pro* pro);
/* Frees a node, its following siblings and all of their descendants. */
void html_freeNode(html_node* node);
/* Frees a child collection and every node it contains. */
void html_freeChildNodes(html_childNodes* childnode);
/* Searches the tree for the first node having an attribute whose key equals keyword; returns a malloc'ed path or NULL. */
path_link* html_getpath(html_childNodes* childNodes,char* keyword,int type);
/* Follows a path produced by html_getpath and returns the node it designates, or NULL. */
html_node* html_findnode(html_childNodes* childNodes,path_link* pathlink);
/* Returns the attribute of node whose key equals proname, or NULL. */
html_pro* html_findpro(html_node* node,char* proname);
#endif
/*****************************************************************
gohtml.c
作者:陈海龙
邮箱:bendows@gmail.com
网址:http://www.goserver.org http://hi.csdn.net/bendows
日期:2009年11月14日
*****************************************************************/
#include <stdlib.h>
#include <string.h>
#include "stdio.h"
#include "gohtml.h"
tag_type tag_type_table_0[]={"!DOCTYPE",DOCTYPE,0,"?XML",XMLHEAD,0,"!--",REMARK,0,NULL};
tag_type tag_type_table_a[]={"A",A,1,"ACRONYM",ACRONYM,1,"ADDRESS",ADDRESS,1,"APPLET",APPLET,1,"AREA",AREA,0,"ATTRIBUTE",ATTRIBUTE,1,NULL};
tag_type tag_type_table_b[]={"B",B,1,"BASE",BASE,0,"BASEFONT",BASEFONT,0,"BDO",BDO,1,"BGSOUND",BGSOUND,0,"BIG",BIG,1,"BLOCKQUOTE",BLOCKQUOTE,1,"BODY",BODY,1,"BR",BR,0,"BUTTON",BUTTON,1,NULL};
tag_type tag_type_table_c[]={"CAPTION",CAPTION,1,"CENTER",CENTER,1,"CITE",CITE,1,"CLIENTINFORMATION",CLIENTINFORMATION,1,"CLIPBOARDDATA",CLIPBOARDDATA,1,"CODE",CODE,1,"COL",COL,0,"COLGROUP",COLGROUP,1,"COMMENT",COMMENT,1,"CURRENTSTYLE",CURRENTSTYLE,1,"CUSTOM",CUSTOM,1,NULL};
tag_type tag_type_table_d[]={"DATATRANSFER",DATATRANSFER,1,"DD",DD,0,"DEFAULTS",DEFAULTS,1,"DEL",DEL,1,"DFN",DFN,1,"DIR",DIR,1,"DIV",DIV,1,"DL",DL,1,"DOCUMENT",DOCUMENT,1,"DT",DT,0,NULL};
tag_type tag_type_table_e[]={"EM",EM,1,"EMBED",EMBED,1,"EVENT",htmlEVENT,1,"EXTERNAL",EXTERNAL,1,NULL};
tag_type tag_type_table_f[]={"FIELDSET",FIELDSET,1,"FONT",FONT,1,"FORM",FORM,1,"FRAME",FRAME,1,"FRAMESET",FRAMESET,1,NULL};
tag_type tag_type_table_h[]={"HEAD",HEAD,1,"HISTORY",HISTORY,1,"HN",HN,1,"HR",HR,0,"HTML",HTML,1,"HTML",HTML,1,NULL};
tag_type tag_type_table_i[]={"I",I,1,"IFRAME",IFRAME,0,"IMG",IMG,0,"IMPLEMENTATION",IMPLEMENTATION,1,"IMPORT",IMPORT,1,"INPUT",INPUT,0,"INS",INS,1,"ISINDEX",ISINDEX,0,NULL};
tag_type tag_type_table_k[]={"KBD",KBD,1,NULL};
tag_type tag_type_table_l[]={"LABEL",LABEL,1,"LEGEND",LEGEND,1,"LI",LI,1,"LINK",LINK,0,"LISTING",LISTING,1,"LOCATION",LOCATION,1,NULL};
tag_type tag_type_table_m[]={"MAP",MAP,1,"MARQUEE",MARQUEE,1,"MENU",MENU,1,"META",META,0,NULL};
tag_type tag_type_table_n[]={"NAMESPACE",NAMESPACE,1,"NAVIGATOR",NAVIGATOR,1,"NEXTID",NEXTID,1,"NOBR",NOBR,1,"NOFRAMES",NOFRAMES,1,"NOSCRIPT",NOSCRIPT,1,NULL};
tag_type tag_type_table_o[]={"OBJECT",OBJECT,1,"OL",OL,1,"OPTGROUP",OPTGROUP,1,"OPTION",OPTION,1,NULL};
tag_type tag_type_table_p[]={"P",P,1,"PAGE",PAGE,1,"PARAM",PARAM,1,"PLAINTEXT",PLAINTEXT,1,"POPUP",POPUP,1,"PRE",PRE,1,NULL};
tag_type tag_type_table_q[]={"Q",Q,1,NULL};
tag_type tag_type_table_r[]={"RT",RT,0,"RUBY",RUBY,1,"RULE",RULE,1,"RUNTIMESTYLE",RUNTIMESTYLE,1,NULL};
tag_type tag_type_table_s[]={"S",S,1,"SAMP",SAMP,1,"SCREEN",SCREEN,1,"SCRIPT",SCRIPT,1,"SELECT",SELECT,1,"SELECTION",SELECTION,1,"SMALL",SMALL,1,"SPAN",SPAN,1,"STRIKE",STRIKE,1,"STRONG",STRONG,1,"STYLE",STYLE,1,"STYLE",STYLE,1,"STYLESHEET",STYLESHEET,1,"SUB",SUB,1,"SUP",SUP,1,NULL};
tag_type tag_type_table_t[]={"TABLE",TABLE,1,"TBODY",TBODY,1,"TD",TD,1,"TEXTAREA",TEXTAREA,1,"TEXTNODE",TEXTNODE,1,"TEXTRANGE",TEXTRANGE,1,"TEXTRECTANGLE",TEXTRECTANGLE,1,"TFOOT",TFOOT,1,"TH",TH,1,"THEAD",THEAD,1,"TITLE",TITLE,1,"TR",TR,1,"TT",TT,1,NULL};
tag_type tag_type_table_u[]={"U",U,1,"UL",UL,1,"USERPROFILE",USERPROFILE,1,NULL};
tag_type tag_type_table_v[]={"VAR",VAR,1,NULL};
tag_type tag_type_table_w[]={"WBR",WBR,0,"WINDOW",WINDOW,0,NULL};
tag_type tag_type_table_x[]={"XML",XML,1,"XMP",XMP,1,NULL};
/* Sentinel-only table for initial letters that have no known tag names. */
static tag_type tag_type_table_none[]={{NULL,0,0}};
/*
 * Dispatch table indexed by (first character of the tag name - 64):
 * index 0 is used for names that do not start with 'A'..'Z', indices 1..26
 * for 'A'..'Z'. Every slot points at a table terminated by a row whose
 * tagname is NULL, so the lookup loop can scan any slot without a NULL
 * check; letters without entries (G, J, Y, Z) use the empty table.
 */
tag_type *tag_type_table[27]=
{
    tag_type_table_0,     /* non-letter names */
    tag_type_table_a,
    tag_type_table_b,
    tag_type_table_c,
    tag_type_table_d,
    tag_type_table_e,
    tag_type_table_f,
    tag_type_table_none,  /* G */
    tag_type_table_h,
    tag_type_table_i,
    tag_type_table_none,  /* J */
    tag_type_table_k,
    tag_type_table_l,
    tag_type_table_m,
    tag_type_table_n,
    tag_type_table_o,
    tag_type_table_p,
    tag_type_table_q,
    tag_type_table_r,
    tag_type_table_s,
    tag_type_table_t,
    tag_type_table_u,
    tag_type_table_v,
    tag_type_table_w,
    tag_type_table_x,
    tag_type_table_none,  /* Y */
    tag_type_table_none,  /* Z */
};
//Initialise a page object
/*
 * Puts *page into its initial state: no tags, no nodes, and every function
 * pointer bound to the default implementation in this file.
 * pagecode is not touched.
 */
void gohtml_init(html_page* page)
{
    /* parse state */
    page->tags=NULL;
    page->curtag=NULL;
    page->childNodes=NULL;

    /* default operations */
    page->parserHTML=parserHTML;
    page->html_createTag=html_createTag;
    page->html_createPro=html_createPro;
    page->html_createNode=html_createNode;
    page->html_createChildNodes=html_createChildNodes;
}
//创建tag标记
html_tag* html_createTag()
{
html_tag* rettag=(html_tag*)malloc(sizeof(html_tag));
rettag->tagname=NULL;
rettag->next=NULL;
rettag->tagtype=0;
rettag->isend=0;
rettag->tag=NULL;
rettag->pro=NULL;
return rettag;
}
//Create an attribute record
/*
 * Allocates an attribute record with key, value and next set to NULL.
 * Returns NULL when the allocation fails. The record is released by
 * html_freePro().
 */
html_pro* html_createPro()
{
    html_pro* retpro=malloc(sizeof *retpro);
    if(retpro==NULL)
    {
        return NULL;
    }
    retpro->key=NULL;
    retpro->value=NULL;
    retpro->next=NULL;
    return retpro;
}
//创建节点
html_node* html_createNode()
{
html_node* node=(html_node*)malloc(sizeof(html_node));
node->childNodes=html_createChildNodes();
node->name=NULL;
node->tagName=NULL;
node->parentNode=NULL;
node->next=NULL;
node->id=NULL;
return node;
}
//Create a child collection
/*
 * Allocates an empty child collection (length 0, nodes and last NULL).
 * Returns NULL when the allocation fails. The collection is released by
 * html_freeChildNodes().
 */
html_childNodes* html_createChildNodes()
{
    html_childNodes* childNodes=malloc(sizeof *childNodes);
    if(childNodes==NULL)
    {
        return NULL;
    }
    childNodes->length=0;
    childNodes->last=NULL;
    childNodes->nodes=NULL;
    return childNodes;
}
//生成tag标记树
int html_gettag(html_page* page)
{
char* strin;
int i=0;
int yh_status=0;//引号状态 0-引号外;>0-引号内
int yh_type='"';//引号类型 ["] 或[']
html_tag* curtag=NULL;
char* strtag=NULL;
int space_count=0;//用于寻找tag的空格;
html_pro* curpro=NULL;//找到的当前属性
int nodestart=0;//0未发现"<" ;1,发现"<",节点开始
char curtagname[30];
int istext=0;// 0未发现游离态文本,>0发现游离态文本
int ischeckequ=0;//是否检查"="
int ischeckvalue=0;//是否检查值(属性的值)
int isscript=0;//是否脚本开始 1=脚本开始
int isremark=0;//是否注释 1=注释开始
strin=page->pagecode;
//page->pagenode=page->html_createChildNodes();
if(strin==NULL)return 0;
while(strin[i]!='/0')
{
//HTML注释
if(isremark==1)
{
if(strncmp(strin+i,"-->",3)==0)
{
isremark=0;
i++;
i++;
i++;
strtag=strin+i+3;
istext=0;
continue;
}
istext++;
i++;
continue;
}
//脚本块内不做任何处理
if(isscript==1)
{
if(strncmp(strin+i,"</SCRIPT>",9)==0 || strncmp(strin+i,"</script>",9)==0)
{
isscript=0;
continue;
}
istext++;
i++;
continue;
}
//引号内代码不解释
if(strin[i]=='"' || strin[i]=='/'')
{
yh_type=strin[i];
if(yh_status==0)
{
yh_status=yh_type;
}
else if(yh_type==yh_status)
{
yh_status=0;
}
if(ischeckvalue==1)//标签属性开始
{
ischeckvalue=0;
strtag=strin+i;
curpro->value=strtag;
}
}
if(yh_status!=0)
{
i++;
continue;
}
//获取所有标签
if(strin[i]=='<')
{
html_tag* newtag;
space_count=0;//清空寻找标志
if(strin[i+1]=='!' && strin[i+2]=='-' && strin[i+3]=='-')
{
//注释
isremark=1;
i=i+3;
continue;
}
if(istext>0)
{
//游离态文本
newtag=page->html_createTag();
if(curtag==NULL)
{
curtag=newtag;
page->tags=newtag;
}
else
{
curtag->next=newtag;
curtag=newtag;
}
newtag->tag=strtag;
strin[i]='/0';
}
istext=0;//游离文本标志清空
newtag=page->html_createTag();
if(curtag==NULL)
{
curtag=newtag;
page->tags=newtag;
}
else
{
curtag->next=newtag;
curtag=newtag;
}
if(nodestart==1)
{
//发现无结束的标签,自动结束
strin[i]='/0';
strtag=strin+i+1;
}
if(strin[i+1]=='/')
{
//结束标记 如: </head>
i++;
newtag->isend=1;
}
strtag=strin+i+1;
newtag->tag=strtag;
newtag->tagname=strtag;
nodestart=1;
}
else if((strin[i]==' ' ||strin[i]=='/t'|| strin[i]=='/r'|| strin[i]=='/n') && nodestart==1)
{
if(ischeckvalue==0)
{
int j=i+1;
if(strin[j]==' ' ||strin[j]=='/t'|| strin[j]=='/r'|| strin[j]=='/n')
{
i++;
continue;
}
if(space_count==0)
{
//保存tagName
strin[i]='/0';
strtag=strin+i+1;
}
if(1==1)
{
//保存属性
html_pro* pro;
pro=page->html_createPro();
if(curtag->pro==NULL)
{
curtag->pro=pro;
curpro=pro;
}
else
{
curpro->next=pro;
curpro=pro;
}
strin[i]='/0';
strtag=strin+i+1;
curpro->key=strtag;
}
space_count++;
ischeckequ=1;//检测等号
}
else
{
strin[i]='/0';
strtag=strin+i+1;
}
}
else if(ischeckequ==1 && strin[i]=='=') //检测到"="
{
ischeckequ=0;
ischeckvalue=1;
strtag=strin+i+1;
strin[i]='/0';
}
else if(ischeckvalue==1 && strin[i]!=' ' && strin[i]!='/t' && strin[i]!='/r' && strin[i]!='/n')
{
ischeckvalue=0;
strtag=strin+i;
curpro->value=strtag;
//
}
else if(strin[i]=='>' && nodestart==1)
{
int tagIndex=0;
int j=0;
tag_type* curtagtype;
strin[i]='/0';
nodestart=0;
if(curtag->tagname==NULL)
{
curtag->tagname=strtag;
strin[i]='/0';
strtag=strin+i+1;
}
strtag=strin+i+1;
istext=0;//游离文本标志清空
tagIndex=curtag->tagname[0]-64;
if(tagIndex<0 || tagIndex>26)
{
tagIndex=0;
}
curtagtype=tag_type_table[tagIndex];
while(curtagtype[j].tagname!=NULL)
{
if(strcmp(curtagtype[j].tagname,curtag->tagname)==0)
{
curtag->tagtype=curtagtype[j].type;
curtag->flag=curtagtype[j].flag;
break;
}
j++;
}
if(curtag->tagtype==SCRIPT && curtag->isend==0)
{
isscript=1;//脚本开始
}
//改结束的都要结束
ischeckequ=0;
ischeckvalue=0;
}
else if(nodestart==0 && !(strin[i]==' ' || strin[i]=='/r' || strin[i]=='/n'|| strin[i]=='/t'))
{
istext++;
}
if(nodestart==1)
{
//标签中
if(strin[i]>=97 && strin[i]<=122)
{
strin[i]=strin[i] & 0xdf; //小写转换成大写
}
}
i++;
}
page->curtag=page->tags;
return 1;
}
/*
 * Builds one level of the node tree from the tag list, starting at
 * page->curtag and advancing it as tags are consumed.
 *
 * Every opening tag (isend==0) becomes a node appended to the sibling list
 * of this level. For a tag whose flag is non-zero the function recurses to
 * collect the children. A closing tag that matches the most recently opened
 * node is consumed; a non-matching closing tag is skipped when that node is
 * a container, otherwise it is left in place and the function returns so an
 * outer level can handle it.
 *
 * page     page whose curtag cursor is read and advanced
 * nlength  receives the number of nodes created at this level (may be NULL)
 *
 * Returns the first node of this level, or NULL when none was created.
 * If a node cannot be allocated the nodes built so far are returned.
 */
html_node* html_getnode(html_page* page,int* nlength)
{
    html_node* node=NULL;     /* head of the sibling list */
    html_node* curnode=NULL;  /* most recently appended node */
    int count=0;
    int nodebegin=0;          /* 1 while curnode is still waiting for its closing tag */

    if(page==NULL)
    {
        if(nlength!=NULL)
        {
            *nlength=0;
        }
        return NULL;
    }
    while(page->curtag!=NULL)
    {
        html_tag* curtag=page->curtag;
        if(curtag->isend==0)
        {
            html_node* created=html_createNode();
            if(created==NULL)
            {
                break;   /* out of memory: stop with what has been built */
            }
            count++;
            nodebegin=1;
            if(node==NULL)
            {
                node=created;
            }
            else
            {
                curnode->next=created;
            }
            curnode=created;
            curnode->tag=curtag;
            curnode->tagName=curtag->tagname;
            page->curtag=curtag->next;
            if(page->curtag==NULL)
            {
                break;
            }
            if(curtag->tagtype==0)
            {
                /* text (or unknown tag): no children, no closing tag expected */
            }
            else if(curtag->flag==0)
            {
                /* non-container: swallow an immediately following matching closing tag */
                if(page->curtag->tagtype==curtag->tagtype && page->curtag->isend==1)
                {
                    page->curtag=page->curtag->next;
                    if(page->curtag==NULL)
                    {
                        break;
                    }
                }
            }
            else
            {
                /* container: everything up to its closing tag becomes its children */
                int n=0;
                html_node* subnode=html_getnode(page,&n);
                curnode->childNodes->nodes=subnode;
                curnode->childNodes->length=n;
            }
        }
        else if(nodebegin==1 && curtag->tagtype==curnode->tag->tagtype)
        {
            /* closing tag of the node opened at this level */
            nodebegin=0;
            page->curtag=curtag->next;
        }
        else if(nodebegin==1 && curnode->tag->flag==1)
        {
            /* stray closing tag after an open container: skip it */
            page->curtag=curtag->next;
        }
        else
        {
            /* closing tag belongs to an outer level */
            break;
        }
    }
    if(nlength!=NULL)
    {
        *nlength=count;
    }
    return node;
}
//Memory release
/*
 * Frees every record of an attribute list. The key/value strings are not
 * freed. Passing NULL is allowed and does nothing.
 */
void html_freePro(html_pro* pro)
{
    html_pro* following;
    for(;pro!=NULL;pro=following)
    {
        following=pro->next;   /* read the link before the record goes away */
        free(pro);
    }
}
/*
 * Frees every record of a tag list together with each tag's attribute list.
 * Passing NULL is allowed and does nothing.
 */
void html_freeTag(html_tag* tag)
{
    html_tag* following;
    for(;tag!=NULL;tag=following)
    {
        following=tag->next;   /* read the link before the record goes away */
        html_freePro(tag->pro);
        free(tag);
    }
}
/*
 * Frees a node, all of its following siblings and, through
 * html_freeChildNodes(), all of their descendants. The tag records the nodes
 * refer to are not freed. Passing NULL is allowed and does nothing.
 */
void html_freeNode(html_node* node)
{
    html_node* following;
    for(;node!=NULL;node=following)
    {
        following=node->next;   /* read the link before the node goes away */
        html_freeChildNodes(node->childNodes);
        free(node);
    }
}
/*
 * Frees a child collection and every node reachable from it.
 * Passing NULL is allowed and does nothing, so it is safe to call on a page
 * whose childNodes has not been built yet.
 */
void html_freeChildNodes(html_childNodes* childnode)
{
    if(childnode==NULL)
    {
        return;
    }
    html_freeNode(childnode->nodes);
    free(childnode);
}
/*
 * Parses page->pagecode: tokenizes it into page->tags and then builds the
 * node tree into a freshly allocated page->childNodes.
 *
 * When tokenizing fails (html_gettag() returns 0) page->childNodes is left
 * as an empty collection. When the collection itself cannot be allocated
 * page->childNodes is NULL. A NULL page is ignored.
 */
void parserHTML(html_page* page)
{
    int nlength=0;
    if(page==NULL)
    {
        return;
    }
    page->childNodes=html_createChildNodes();
    if(page->childNodes==NULL)
    {
        return;
    }
    if(html_gettag(page)==0)
    {
        return;   /* nothing to build the tree from */
    }
    page->childNodes->nodes=html_getnode(page,&nlength);
    page->childNodes->length=nlength;
}
/*
 * Searches the tree depth-first for the first node that has an attribute
 * whose key equals keyword, and returns the path to it.
 *
 * childNodes  level to start searching from (NULL yields NULL)
 * keyword     attribute key to look for, compared with strcmp(); note that
 *             html_gettag() upper-cases unquoted characters inside tags, so
 *             keys are stored in upper case
 * type        intended selector (0: key, 1: value, 2: tag type);
 *             NOTE(review): only the key search is implemented, the value
 *             of type is currently ignored
 *
 * Returns a malloc'ed chain of path_link steps (one per tree level, sibling
 * index in `site`) that the caller must free link by link, or NULL when
 * nothing matches or memory runs out.
 */
path_link* html_getpath(html_childNodes* childNodes,char* keyword,int type)
{
    html_node* curnode;
    int i=0;

    if(childNodes==NULL || keyword==NULL)
    {
        return NULL;
    }
    for(curnode=childNodes->nodes;curnode!=NULL;curnode=curnode->next,i++)
    {
        html_pro* curpro=(curnode->tag!=NULL)?curnode->tag->pro:NULL;
        path_link* pathret;
        path_link* pathlink;

        /* does this node itself carry the attribute? */
        for(;curpro!=NULL;curpro=curpro->next)
        {
            if(curpro->key!=NULL && strcmp(curpro->key,keyword)==0)
            {
                pathlink=malloc(sizeof *pathlink);
                if(pathlink==NULL)
                {
                    return NULL;
                }
                pathlink->site=i;
                pathlink->next=NULL;
                return pathlink;
            }
        }
        /* otherwise look below it and prepend this level to the child path */
        pathret=html_getpath(curnode->childNodes,keyword,type);
        if(pathret!=NULL)
        {
            pathlink=malloc(sizeof *pathlink);
            if(pathlink==NULL)
            {
                /* do not leak the partial path built by the recursion */
                while(pathret!=NULL)
                {
                    path_link* following=pathret->next;
                    free(pathret);
                    pathret=following;
                }
                return NULL;
            }
            pathlink->site=i;
            pathlink->next=pathret;
            return pathlink;
        }
    }
    return NULL;
}
//Find a node
/*
 * Follows pathlink through the tree rooted at childNodes: at each level the
 * step's `site` selects the sibling at that zero-based position, and the
 * next step continues in that node's children.
 *
 * Returns the node selected by the last step, or NULL when pathlink is NULL
 * or a level has too few nodes.
 */
html_node* html_findnode(html_childNodes* childNodes,path_link* pathlink)
{
    html_childNodes* level=childNodes;
    path_link* step;

    for(step=pathlink;step!=NULL;step=step->next)
    {
        html_node* hit=level->nodes;
        int remaining;

        /* walk `site` siblings forward */
        for(remaining=step->site;remaining>0;remaining--)
        {
            if(hit==NULL || (hit=hit->next)==NULL)
            {
                return NULL;
            }
        }
        if(hit==NULL)
        {
            return NULL;
        }
        if(step->next==NULL)
        {
            return hit;   /* last step: this is the target */
        }
        level=hit->childNodes;
    }
    return NULL;
}
//Find an attribute of a node
/*
 * Scans the attribute list of node's tag for a key equal to proname
 * (compared with strcmp()).
 * Returns the matching attribute record, or NULL when there is none.
 */
html_pro* html_findpro(html_node* node,char* proname)
{
    html_pro* candidate;
    for(candidate=node->tag->pro;candidate!=NULL;candidate=candidate->next)
    {
        if(strcmp(candidate->key,proname)!=0)
        {
            continue;
        }
        return candidate;
    }
    return NULL;
}
void main()
{
html_page mypage;
char strhtml[1024];
sprintf(strhtml,"<html><head></head><body><table><tr><td>hello world!</td></tr></table></body></html>");
mypage.pagecode=strhtml;
gohtml_init(&mypage);
mypage.parserHTML(&mypage);
printf("解释完成!mypage对象为解释结果!");
}
- C语言解释HTML文档
- 解释运行c语言
- C语言魔王语言解释
- c语言学习文档
- C语言错误信息中文解释
- C语言复杂声明解释
- C语言错误信息解释:中文
- c语言static关键字解释
- C语言解释器LUA
- xrc C语言解释器
- SylixOS C语言解释器
- c语言数组名解释
- 复习HTML C语言
- [数据结构]魔王语言解释 c语言实现
- C语言如何解释a<b<c
- c语言解析xml文档
- C语言scanf函数详细解释
- C语言的stdio解释(4)
- 理解Tomcat workers
- setup.py 例子
- 求整数中比特为1的二进制位数
- 网页制作的小技巧
- QuickCSharp框架开发(12)------实现一个具体的验证提供者工厂对象AuthenticationProviderFactory
- C语言解释HTML文档
- 配置Apache Http Server2.2.4与Tomcat6.014
- 用JS写一个日历
- 如何加密解密DataSet数据集
- 同时安装sql2000和2005
- SQLSERVER2000一些内置存储过程用法和说明
- 使用Windows 7 和Ubuntu9.10双系统,真实感受
- 如何加密解密文件
- java? net