C语言解释HTML文档

来源:互联网 发布:c语言大于等于并小于 编辑:程序博客网 时间:2024/06/02 00:07

     在C语言编程中,偶尔会遇到需要访问HTML文档的情况。HTML是结构化文档,如果把它当作普通字符串、用查找的方式去访问,往往满足不了需求。为了解决这个问题,我写下了这段代码。代码在VS 2005中调试通过。

/*****************************************************************
gohtml.h
作者:陈海龙
邮箱:bendows@gmail.com
网址:http://www.goserver.org  http://hi.csdn.net/bendows
日期:2009年11月14日
*****************************************************************/


#ifndef GOHTML_H
#define GOHTML_H


/*
 * Tag identifiers stored in tag_type.type and html_tag.tagtype.
 * Enumerators are numbered consecutively from 0, so their order is part of
 * the interface. DONTDEL holds value 0, which is also the tagtype that
 * html_createTag assigns by default and that html_getnode treats as a text
 * node; every real tag therefore has a non-zero identifier.
 */
enum TAG_ENUM
{
    DONTDEL, XMLHEAD, REMARK, DOCTYPE,
    A, ACRONYM, ADDRESS, APPLET, AREA, ATTRIBUTE,
    B, BASE, BASEFONT, BDO, BGSOUND, BIG, BLOCKQUOTE, BODY, BR, BUTTON,
    CAPTION, CENTER, CITE, CLIENTINFORMATION, CLIPBOARDDATA, CODE, COL,
    COLGROUP, COMMENT, CURRENTSTYLE, CUSTOM,
    DATATRANSFER, DD, DEFAULTS, DEL, DFN, DIR, DIV, DL, DOCUMENT, DT,
    EM, EMBED, htmlEVENT, EXTERNAL,
    FIELDSET, FONT, FORM, FRAME, FRAMESET,
    HEAD, HISTORY, HN, HR, HTML,
    I, IFRAME, IMG, IMPLEMENTATION, IMPORT, INPUT, INS, ISINDEX,
    KBD,
    LABEL, LEGEND, LI, LINK, LISTING, LOCATION,
    MAP, MARQUEE, MENU, META,
    NAMESPACE, NAVIGATOR, NEXTID, NOBR, NOFRAMES, NOSCRIPT,
    OBJECT, OL, OPTGROUP, OPTION,
    P, PAGE, PARAM, PLAINTEXT, POPUP, PRE,
    Q,
    RT, RUBY, RULE, RUNTIMESTYLE,
    S, SAMP, SCREEN, SCRIPT, SELECT, SELECTION, SMALL, SPAN, STRIKE, STRONG,
    STYLE, STYLESHEET, SUB, SUP,
    TABLE, TBODY, TD, TEXTAREA, TEXTNODE, TEXTRANGE, TEXTRECTANGLE, TFOOT,
    TH, THEAD, TITLE, TR, TT,
    U, UL, USERPROFILE,
    VAR,
    WBR, WINDOW,
    XML, XMP
};

/*
 * One row of a tag lookup table (see the tag_type_table_* arrays).
 * A row whose tagname is NULL terminates its table.
 */
typedef struct TAG_TYPE
{
    char *tagname;  /* upper-case tag name compared against html_tag.tagname */
    int   type;     /* enum TAG_ENUM value copied into html_tag.tagtype */
    int   flag;     /* 1 = container tag, 0 = non-container tag; copied into html_tag.flag */
} tag_type;


/*
 * One attribute of a tag, kept in a singly linked list hanging off
 * html_tag.pro. key and value point into the page buffer; they are not
 * separately allocated (html_freePro releases only the list cells).
 */
typedef struct HTML_PRO
{
    char            *key;    /* attribute name */
    char            *value;  /* attribute value, or NULL when none was seen */
    struct HTML_PRO *next;   /* next attribute of the same tag, NULL at the end */
} html_pro;
/*
 * One entry of the flat tag list produced by html_gettag. An entry is either
 * a markup tag (start or end) or a run of free text between tags.
 */
typedef struct HTML_TAG
{
    char            *tag;      /* start of the tag text (or of the free text) inside the page buffer */
    char            *tagname;  /* tag name; left NULL for free-text entries */
    int              tagtype;  /* enum TAG_ENUM value, 0 when the name is not in the lookup tables */
    int              isend;    /* 1 for an end tag such as </head>, otherwise 0 */
    int              flag;     /* container flag copied from the lookup table (1 = container) */
    struct HTML_PRO *pro;      /* attribute list, NULL when the tag has none */
    struct HTML_TAG *next;     /* next entry in document order */
} html_tag;
/*
 * Child list of a node (or the top-level list of a page).
 */
typedef struct HTML_CHILDNODES
{
    int               length;  /* number of nodes in the list */
    struct HTML_NODE *nodes;   /* first child; siblings are chained through html_node.next */
    struct HTML_NODE *last;    /* set to NULL by html_createChildNodes; not otherwise written in this file */
} html_childNodes;
/*
 * One node of the document tree built by html_getnode.
 */
typedef struct HTML_NODE
{
    html_tag               *tag;         /* tag-list entry this node was built from */
    char                   *tagName;     /* copy of tag->tagname (same pointer) */
    char                   *name;        /* not populated by the parser in this file */
    char                   *id;          /* not populated by the parser in this file */
    char                   *innerHTML;   /* not populated by the parser in this file */
    struct HTML_CHILDNODES *childNodes;  /* children of this node; allocated by html_createNode */
    struct HTML_NODE       *parentNode;  /* not populated by the parser in this file */
    struct HTML_NODE       *next;        /* next sibling, NULL for the last one */
} html_node;
/*
 * Page object: the input buffer, the parse results and the function pointers
 * installed by gohtml_init.
 *
 * parserHTML takes a struct HTML_PAGE pointer so that its type matches the
 * parserHTML() function that gohtml_init stores in it; with the former
 * `void*` parameter that assignment was between incompatible function
 * pointer types.
 */
typedef struct HTML_PAGE
{
    char* pagecode;                 /* writable, NUL-terminated HTML text; edited in place by the parser */
    html_childNodes *childNodes;    /* top-level node list, NULL until the page has been parsed */
    html_tag* tags;                 /* head of the flat tag list built by html_gettag */
    html_tag* curtag;               /* read cursor into the tag list, advanced by html_getnode */

    void (*parserHTML)(struct HTML_PAGE* page);   /* parses pagecode into tags and childNodes */

    html_tag* (*html_createTag)();                /* tag allocator used by html_gettag */
    html_pro* (*html_createPro)();                /* attribute allocator used by html_gettag */

    html_childNodes* (*html_createChildNodes)();  /* child-list allocator */
    html_node* (*html_createNode)();              /* node allocator */
}html_page;
/*
 * One step of a node path as returned by html_getpath and consumed by
 * html_findnode: the zero-based position of a node among its siblings,
 * followed by the step for the next deeper level.
 */
typedef struct PATH_LINK
{
    int               site;  /* zero-based sibling index at this level */
    struct PATH_LINK *next;  /* step for the next level down, NULL at the target node */
} path_link;

/* Page setup and parsing.
   gohtml_init installs the default function pointers and clears childNodes,
   tags and curtag; it does not touch pagecode.
   parserHTML builds page->tags and page->childNodes from page->pagecode. */
void gohtml_init(html_page *page);
void parserHTML(html_page *page);

/* Allocators; each returns a freshly malloc'ed object. */
html_childNodes *html_createChildNodes();
html_tag        *html_createTag();
html_pro        *html_createPro();
html_node       *html_createNode();

/* Destructors. html_freeTag, html_freePro and html_freeNode release a whole
   `next`-linked list starting at the argument. */
void html_freeTag(html_tag *tag);
void html_freePro(html_pro *pro);
void html_freeNode(html_node *node);
void html_freeChildNodes(html_childNodes *childnode);

/* Lookup helpers. html_getpath returns a malloc'ed path to the first node
   owning an attribute named keyword; html_findnode resolves such a path back
   to a node; html_findpro finds an attribute of a node by name. */
path_link *html_getpath(html_childNodes *childNodes, char *keyword, int type);
html_node *html_findnode(html_childNodes *childNodes, path_link *pathlink);
html_pro  *html_findpro(html_node *node, char *proname);

#endif

 

/*****************************************************************
gohtml.c
作者:陈海龙
邮箱:bendows@gmail.com
网址:http://www.goserver.org  http://hi.csdn.net/bendows
日期:2009年11月14日
*****************************************************************/


#include <stdlib.h>
#include <string.h>
#include "stdio.h"
#include "gohtml.h"
tag_type tag_type_table_0[]={"!DOCTYPE",DOCTYPE,0,"?XML",XMLHEAD,0,"!--",REMARK,0,NULL};
tag_type tag_type_table_a[]={"A",A,1,"ACRONYM",ACRONYM,1,"ADDRESS",ADDRESS,1,"APPLET",APPLET,1,"AREA",AREA,0,"ATTRIBUTE",ATTRIBUTE,1,NULL};
tag_type tag_type_table_b[]={"B",B,1,"BASE",BASE,0,"BASEFONT",BASEFONT,0,"BDO",BDO,1,"BGSOUND",BGSOUND,0,"BIG",BIG,1,"BLOCKQUOTE",BLOCKQUOTE,1,"BODY",BODY,1,"BR",BR,0,"BUTTON",BUTTON,1,NULL};
tag_type tag_type_table_c[]={"CAPTION",CAPTION,1,"CENTER",CENTER,1,"CITE",CITE,1,"CLIENTINFORMATION",CLIENTINFORMATION,1,"CLIPBOARDDATA",CLIPBOARDDATA,1,"CODE",CODE,1,"COL",COL,0,"COLGROUP",COLGROUP,1,"COMMENT",COMMENT,1,"CURRENTSTYLE",CURRENTSTYLE,1,"CUSTOM",CUSTOM,1,NULL};
tag_type tag_type_table_d[]={"DATATRANSFER",DATATRANSFER,1,"DD",DD,0,"DEFAULTS",DEFAULTS,1,"DEL",DEL,1,"DFN",DFN,1,"DIR",DIR,1,"DIV",DIV,1,"DL",DL,1,"DOCUMENT",DOCUMENT,1,"DT",DT,0,NULL};
tag_type tag_type_table_e[]={"EM",EM,1,"EMBED",EMBED,1,"EVENT",htmlEVENT,1,"EXTERNAL",EXTERNAL,1,NULL};
tag_type tag_type_table_f[]={"FIELDSET",FIELDSET,1,"FONT",FONT,1,"FORM",FORM,1,"FRAME",FRAME,1,"FRAMESET",FRAMESET,1,NULL};
tag_type tag_type_table_h[]={"HEAD",HEAD,1,"HISTORY",HISTORY,1,"HN",HN,1,"HR",HR,0,"HTML",HTML,1,"HTML",HTML,1,NULL};
tag_type tag_type_table_i[]={"I",I,1,"IFRAME",IFRAME,0,"IMG",IMG,0,"IMPLEMENTATION",IMPLEMENTATION,1,"IMPORT",IMPORT,1,"INPUT",INPUT,0,"INS",INS,1,"ISINDEX",ISINDEX,0,NULL};
tag_type tag_type_table_k[]={"KBD",KBD,1,NULL};
tag_type tag_type_table_l[]={"LABEL",LABEL,1,"LEGEND",LEGEND,1,"LI",LI,1,"LINK",LINK,0,"LISTING",LISTING,1,"LOCATION",LOCATION,1,NULL};
tag_type tag_type_table_m[]={"MAP",MAP,1,"MARQUEE",MARQUEE,1,"MENU",MENU,1,"META",META,0,NULL};
tag_type tag_type_table_n[]={"NAMESPACE",NAMESPACE,1,"NAVIGATOR",NAVIGATOR,1,"NEXTID",NEXTID,1,"NOBR",NOBR,1,"NOFRAMES",NOFRAMES,1,"NOSCRIPT",NOSCRIPT,1,NULL};
tag_type tag_type_table_o[]={"OBJECT",OBJECT,1,"OL",OL,1,"OPTGROUP",OPTGROUP,1,"OPTION",OPTION,1,NULL};
tag_type tag_type_table_p[]={"P",P,1,"PAGE",PAGE,1,"PARAM",PARAM,1,"PLAINTEXT",PLAINTEXT,1,"POPUP",POPUP,1,"PRE",PRE,1,NULL};
tag_type tag_type_table_q[]={"Q",Q,1,NULL};
tag_type tag_type_table_r[]={"RT",RT,0,"RUBY",RUBY,1,"RULE",RULE,1,"RUNTIMESTYLE",RUNTIMESTYLE,1,NULL};
tag_type tag_type_table_s[]={"S",S,1,"SAMP",SAMP,1,"SCREEN",SCREEN,1,"SCRIPT",SCRIPT,1,"SELECT",SELECT,1,"SELECTION",SELECTION,1,"SMALL",SMALL,1,"SPAN",SPAN,1,"STRIKE",STRIKE,1,"STRONG",STRONG,1,"STYLE",STYLE,1,"STYLE",STYLE,1,"STYLESHEET",STYLESHEET,1,"SUB",SUB,1,"SUP",SUP,1,NULL};
tag_type tag_type_table_t[]={"TABLE",TABLE,1,"TBODY",TBODY,1,"TD",TD,1,"TEXTAREA",TEXTAREA,1,"TEXTNODE",TEXTNODE,1,"TEXTRANGE",TEXTRANGE,1,"TEXTRECTANGLE",TEXTRECTANGLE,1,"TFOOT",TFOOT,1,"TH",TH,1,"THEAD",THEAD,1,"TITLE",TITLE,1,"TR",TR,1,"TT",TT,1,NULL};
tag_type tag_type_table_u[]={"U",U,1,"UL",UL,1,"USERPROFILE",USERPROFILE,1,NULL};
tag_type tag_type_table_v[]={"VAR",VAR,1,NULL};
tag_type tag_type_table_w[]={"WBR",WBR,0,"WINDOW",WINDOW,0,NULL};
tag_type tag_type_table_x[]={"XML",XML,1,"XMP",XMP,1,NULL};
/* Sentinel-only table used for initial letters that have no known tag names. */
static tag_type tag_type_table_none[]={{NULL,0,0}};

/*
 * Lookup tables indexed by (first character of the upper-case tag name - 64):
 * slot 0 serves names that do not start with a letter, slots 1..26 serve
 * 'A'..'Z'. Letters without any known tag (G, J, Y, Z) point at an empty,
 * sentinel-terminated table instead of NULL, because html_gettag walks the
 * selected table without checking the slot for NULL first.
 */
tag_type *tag_type_table[27]=
{
    tag_type_table_0,     /*  0: '!', '?' and anything out of range */
    tag_type_table_a,     /*  1: A */
    tag_type_table_b,     /*  2: B */
    tag_type_table_c,     /*  3: C */
    tag_type_table_d,     /*  4: D */
    tag_type_table_e,     /*  5: E */
    tag_type_table_f,     /*  6: F */
    tag_type_table_none,  /*  7: G */
    tag_type_table_h,     /*  8: H */
    tag_type_table_i,     /*  9: I */
    tag_type_table_none,  /* 10: J */
    tag_type_table_k,     /* 11: K */
    tag_type_table_l,     /* 12: L */
    tag_type_table_m,     /* 13: M */
    tag_type_table_n,     /* 14: N */
    tag_type_table_o,     /* 15: O */
    tag_type_table_p,     /* 16: P */
    tag_type_table_q,     /* 17: Q */
    tag_type_table_r,     /* 18: R */
    tag_type_table_s,     /* 19: S */
    tag_type_table_t,     /* 20: T */
    tag_type_table_u,     /* 21: U */
    tag_type_table_v,     /* 22: V */
    tag_type_table_w,     /* 23: W */
    tag_type_table_x,     /* 24: X */
    tag_type_table_none,  /* 25: Y */
    tag_type_table_none,  /* 26: Z */
};


/*
 * Prepare a page object for parsing: clear the parse results and install the
 * default parser and allocator callbacks. page->pagecode is left untouched,
 * so the caller may set it before or after this call.
 */
void gohtml_init(html_page* page)
{
    /* no parse results yet */
    page->childNodes=NULL;
    page->tags=NULL;
    page->curtag=NULL;

    /* default callbacks */
    page->parserHTML=parserHTML;
    page->html_createTag=html_createTag;
    page->html_createPro=html_createPro;
    page->html_createChildNodes=html_createChildNodes;
    page->html_createNode=html_createNode;
}

//创建tag标记
html_tag* html_createTag()
{
 html_tag* rettag=(html_tag*)malloc(sizeof(html_tag));
 rettag->tagname=NULL;
 rettag->next=NULL;
 rettag->tagtype=0;
 rettag->isend=0;
 rettag->tag=NULL;
 rettag->pro=NULL;
 return rettag;
}

/*
 * Allocate an attribute cell with key, value and next set to NULL.
 *
 * Returns the new cell, or NULL when memory could not be allocated.
 * The cell is released with html_freePro.
 */
html_pro* html_createPro()
{
    html_pro* retpro=malloc(sizeof *retpro);
    if(retpro==NULL)
    {
        return NULL;
    }
    retpro->key=NULL;
    retpro->value=NULL;
    retpro->next=NULL;
    return retpro;
}
//创建节点
html_node* html_createNode()
{
 html_node* node=(html_node*)malloc(sizeof(html_node));
 node->childNodes=html_createChildNodes();
 node->name=NULL;
 node->tagName=NULL;
 node->parentNode=NULL;
 node->next=NULL;
 node->id=NULL;
 return node;

}
/*
 * Allocate an empty child list (length 0, no nodes).
 *
 * Returns the new list, or NULL when memory could not be allocated.
 * The list is released with html_freeChildNodes.
 */
html_childNodes* html_createChildNodes()
{
    html_childNodes* childNodes=malloc(sizeof *childNodes);
    if(childNodes==NULL)
    {
        return NULL;
    }
    childNodes->length=0;
    childNodes->nodes=NULL;
    childNodes->last=NULL;
    return childNodes;
}
//生成tag标记树
int html_gettag(html_page* page)
{
 char* strin;
 int i=0;
 int yh_status=0;//引号状态 0-引号外;>0-引号内
 int yh_type='"';//引号类型 ["] 或[']

 html_tag* curtag=NULL;
 char* strtag=NULL;
 int space_count=0;//用于寻找tag的空格;

 html_pro* curpro=NULL;//找到的当前属性

 int nodestart=0;//0未发现"<" ;1,发现"<",节点开始
 char curtagname[30];

 int istext=0;// 0未发现游离态文本,>0发现游离态文本

 int ischeckequ=0;//是否检查"="
 int ischeckvalue=0;//是否检查值(属性的值)

 int isscript=0;//是否脚本开始 1=脚本开始

 int isremark=0;//是否注释 1=注释开始


 strin=page->pagecode;
 //page->pagenode=page->html_createChildNodes();
 
 if(strin==NULL)return 0;
 while(strin[i]!='/0')
 {
  //HTML注释
  if(isremark==1)
  {
   if(strncmp(strin+i,"-->",3)==0)
   {
    isremark=0;
    i++;
    i++;
    i++;
    strtag=strin+i+3;
    istext=0;
    continue;
   }
   istext++;
   i++;
   continue;
  }
  //脚本块内不做任何处理
  if(isscript==1)
  {
   if(strncmp(strin+i,"</SCRIPT>",9)==0 || strncmp(strin+i,"</script>",9)==0)
   {
    isscript=0;
    continue;
   }
   istext++;
   i++;
   continue;
  }
  //引号内代码不解释
  if(strin[i]=='"' || strin[i]=='/'')
  {
   yh_type=strin[i];
   if(yh_status==0)
   {
    yh_status=yh_type;
   }
   else if(yh_type==yh_status)
   {
    yh_status=0;
   }
   
   if(ischeckvalue==1)//标签属性开始
   {
    ischeckvalue=0;
    strtag=strin+i;
    curpro->value=strtag;
   }
  }
  if(yh_status!=0)
  {
   i++;
   continue;
  }
  //获取所有标签
  if(strin[i]=='<')
  {
   html_tag* newtag;
   space_count=0;//清空寻找标志
   if(strin[i+1]=='!' && strin[i+2]=='-'  && strin[i+3]=='-')
   {
    //注释
    isremark=1;
    i=i+3;
    continue;
   }
   if(istext>0)
   {
    //游离态文本
    newtag=page->html_createTag();
    if(curtag==NULL)
    {
     curtag=newtag;
     page->tags=newtag;
    }
    else
    {
     curtag->next=newtag;
     curtag=newtag;
    }
    
    newtag->tag=strtag;
    strin[i]='/0';
    
   }
   istext=0;//游离文本标志清空
   
   newtag=page->html_createTag();
   if(curtag==NULL)
   {
    curtag=newtag;
    page->tags=newtag;
   }
   else
   {
    curtag->next=newtag;
    curtag=newtag;
   }
   if(nodestart==1)
   {
    //发现无结束的标签,自动结束
    strin[i]='/0';
    strtag=strin+i+1;
   }

   if(strin[i+1]=='/')
   {
    //结束标记 如: </head>
    i++;
    newtag->isend=1;
   }
   
   strtag=strin+i+1;
   newtag->tag=strtag;
   newtag->tagname=strtag;

   nodestart=1;   
  }
  else if((strin[i]==' ' ||strin[i]=='/t'|| strin[i]=='/r'|| strin[i]=='/n') && nodestart==1)
  {
   if(ischeckvalue==0)
   {
    int j=i+1;
    if(strin[j]==' ' ||strin[j]=='/t'|| strin[j]=='/r'|| strin[j]=='/n')
    {
     i++;
     continue;
    }
    if(space_count==0)
    {
     //保存tagName
     strin[i]='/0';
     strtag=strin+i+1;
    }
    if(1==1)
    {
     //保存属性
     html_pro* pro;
     pro=page->html_createPro();
     if(curtag->pro==NULL)
     {
      curtag->pro=pro;
      curpro=pro;
     }
     else
     {
      curpro->next=pro;
      curpro=pro;
     }
     
     strin[i]='/0';
     strtag=strin+i+1;
     curpro->key=strtag;
    }
    space_count++;
    ischeckequ=1;//检测等号
   }
   else
   {
    strin[i]='/0';
    strtag=strin+i+1;
   }
  

  }
  else if(ischeckequ==1 && strin[i]=='=') //检测到"="
  {   
   ischeckequ=0;
   ischeckvalue=1;
   strtag=strin+i+1;
   strin[i]='/0';
  }
  else if(ischeckvalue==1 &&  strin[i]!=' ' && strin[i]!='/t' && strin[i]!='/r' && strin[i]!='/n')
  {
   ischeckvalue=0;
   strtag=strin+i;
   curpro->value=strtag;
   //
  }
  else if(strin[i]=='>' && nodestart==1)
  {
   int tagIndex=0;
   int j=0;
   tag_type* curtagtype;
   strin[i]='/0';
   nodestart=0;
   if(curtag->tagname==NULL)
   {
    curtag->tagname=strtag;
    strin[i]='/0';
    strtag=strin+i+1;
   }
   strtag=strin+i+1;
   istext=0;//游离文本标志清空
   tagIndex=curtag->tagname[0]-64;
   if(tagIndex<0 || tagIndex>26)
   {
    tagIndex=0;
    
   }
   curtagtype=tag_type_table[tagIndex];
   while(curtagtype[j].tagname!=NULL)
   {
    if(strcmp(curtagtype[j].tagname,curtag->tagname)==0)
    {
     curtag->tagtype=curtagtype[j].type;
     curtag->flag=curtagtype[j].flag;
     break;
    }
    j++;
   }
   if(curtag->tagtype==SCRIPT && curtag->isend==0)
   {
    isscript=1;//脚本开始
   }
   

   //改结束的都要结束
   ischeckequ=0;
   ischeckvalue=0;
  }
  else if(nodestart==0 && !(strin[i]==' ' || strin[i]=='/r' || strin[i]=='/n'|| strin[i]=='/t'))
  {
   istext++;
  }
  
  if(nodestart==1)
  {
   //标签中
   if(strin[i]>=97 && strin[i]<=122)
   {
    strin[i]=strin[i] & 0xdf; //小写转换成大写
   }
  }
  i++;
 }
 page->curtag=page->tags;
 return 1;

}
/*
 * Build one sibling list of the document tree from the tag list, starting at
 * page->curtag and advancing it as tags are consumed. Container tags
 * (flag==1) recurse to collect their children; the call returns when it meets
 * an end tag it cannot attribute to the node it created last, leaving
 * page->curtag on that end tag for the caller.
 *
 * page     page whose curtag cursor is read and advanced
 * nlength  receives the number of nodes in the returned list
 *
 * Returns the first node of the list (siblings are linked through `next`),
 * or NULL when no node was created. If a node cannot be allocated the list
 * built so far is returned.
 */
html_node* html_getnode(html_page* page,int* nlength)
{
    html_node* node=NULL;     // head of the sibling list
    html_node* curnode=NULL;  // node created last
    int count=0;
    int nodebegin=0;          // 1 while the last node may still be closed by an end tag

    while(page->curtag!=NULL)
    {
        html_tag* curtag=page->curtag;
        if(curtag->isend==0)
        {
            html_node* newnode=html_createNode();
            if(newnode==NULL)
            {
                break; // out of memory: hand back what has been built
            }
            count++;
            nodebegin=1;
            if(node==NULL)
            {
                node=newnode;
            }
            else
            {
                curnode->next=newnode;
            }
            curnode=newnode;

            curnode->tag=curtag;
            curnode->tagName=curtag->tagname;
            page->curtag=curtag->next;
            if(page->curtag==NULL)
            {
                break;
            }

            if(curtag->tagtype==0)
            {
                // text node (or unknown tag): no children, no end tag expected
            }
            else if(curtag->flag==0)
            {
                // non-container node: swallow an immediately following matching end tag
                if(page->curtag->tagtype==curtag->tagtype && page->curtag->isend==1)
                {
                    page->curtag=page->curtag->next;
                }
            }
            else
            {
                // container node: everything up to its end tag becomes its children
                int n=0;
                curnode->childNodes->nodes=html_getnode(page,&n);
                curnode->childNodes->length=n;
            }
        }
        else
        {
            if(nodebegin==1 && curtag->tagtype==curnode->tag->tagtype)
            {
                // end tag of the node created last
                nodebegin=0;
                page->curtag=curtag->next;
            }
            else if(nodebegin==1 && curnode->tag->flag==1 && curtag->tagtype!=curnode->tag->tagtype)
            {
                // end tag of some other type right after a container: skip it
                page->curtag=curtag->next;
            }
            else
            {
                // end tag that belongs to an enclosing node: let the caller handle it
                break;
            }
        }
    }
    *nlength=count;
    return node;
}
/*
 * Release an attribute list. Only the list cells are freed; key and value
 * are not, as they are not owned by the cells.
 */
void html_freePro(html_pro* pro)
{
    while(pro!=NULL)
    {
        html_pro* following=pro->next;  /* fetch the link before the cell goes away */
        free(pro);
        pro=following;
    }
}
/*
 * Release a tag list: every entry reachable through `next`, together with
 * each entry's attribute list.
 */
void html_freeTag(html_tag* tag)
{
    while(tag!=NULL)
    {
        html_tag* following=tag->next;  /* fetch the link before the entry goes away */
        html_freePro(tag->pro);
        free(tag);
        tag=following;
    }
}
/*
 * Release a sibling list of nodes and, through html_freeChildNodes, the
 * whole subtree below each of them. The tags the nodes refer to are not freed.
 */
void html_freeNode(html_node* node)
{
    while(node!=NULL)
    {
        html_node* sibling=node->next;  /* fetch the link before the node goes away */
        html_freeChildNodes(node->childNodes);
        free(node);
        node=sibling;
    }
}
/*
 * Release a child list and every node (with its subtree) it contains.
 * Passing NULL is allowed and does nothing, so a page that was initialised
 * but never parsed (childNodes still NULL) can be cleaned up the same way as
 * a parsed one.
 */
void html_freeChildNodes(html_childNodes* childnode)
{
    if(childnode==NULL)
    {
        return;
    }
    html_freeNode(childnode->nodes);
    free(childnode);
}

/*
 * Parse page->pagecode: build the flat tag list (page->tags) and then the
 * node tree (page->childNodes).
 *
 * page->childNodes is replaced by a newly allocated list; a list left over
 * from an earlier parse is not freed here. When the tag scan reports failure
 * the child list is left empty, and when the list itself cannot be allocated
 * page->childNodes is NULL on return.
 */
void parserHTML(html_page* page)
{
    int nlength=0;
    int scanned=html_gettag(page);

    page->childNodes=html_createChildNodes();
    if(page->childNodes==NULL)
    {
        return;
    }
    if(scanned)
    {
        page->childNodes->nodes=html_getnode(page,&nlength);
        page->childNodes->length=nlength;
    }
}


/*
 * Find the first node, in depth-first document order, that owns an attribute
 * whose key equals keyword, and return the path leading to it.
 *
 * childNodes  list to search (searched recursively); NULL yields NULL
 * keyword     attribute key to look for; NULL yields NULL. html_gettag
 *             upper-cases unquoted text inside tags, so keys are stored in
 *             upper case.
 * type        intended selector (0: match key, 1: match value, 2: match tag
 *             type). Only key matching is implemented; the argument is
 *             currently ignored.
 *
 * Returns a malloc'ed path_link chain (one link per tree level, `site` being
 * the zero-based sibling index at that level) that html_findnode accepts, or
 * NULL when nothing matched or memory ran out. The caller frees each link.
 */
path_link* html_getpath(html_childNodes* childNodes,char* keyword,int type)
{
    html_node* curnode;
    int i=0;

    (void)type;
    if(childNodes==NULL || keyword==NULL)
    {
        return NULL;
    }

    for(curnode=childNodes->nodes;curnode!=NULL;curnode=curnode->next,i++)
    {
        html_pro* curpro;
        path_link* pathret=NULL;
        path_link* pathlink;
        int found=0;

        // look at the node's own attributes first
        for(curpro=curnode->tag->pro;curpro!=NULL;curpro=curpro->next)
        {
            if(curpro->key!=NULL && strcmp(curpro->key,keyword)==0)
            {
                found=1;
                break;
            }
        }
        // otherwise descend into its children
        if(!found)
        {
            pathret=html_getpath(curnode->childNodes,keyword,type);
            if(pathret==NULL)
            {
                continue;
            }
        }

        pathlink=malloc(sizeof *pathlink);
        if(pathlink==NULL)
        {
            // do not leak the part of the path found below this node
            while(pathret!=NULL)
            {
                path_link* following=pathret->next;
                free(pathret);
                pathret=following;
            }
            return NULL;
        }
        pathlink->site=i;
        pathlink->next=pathret;
        return pathlink;
    }

    return NULL;
}
/*
 * Resolve a path (as produced by html_getpath) to the node it designates.
 * At each level the link's `site` selects a sibling by zero-based position
 * and the search continues in that sibling's child list.
 *
 * Returns the node named by the last link, or NULL when the path is empty or
 * runs past the end of a sibling list.
 */
html_node* html_findnode(html_childNodes* childNodes,path_link* pathlink)
{
    html_childNodes* level=childNodes;
    path_link* step;

    for(step=pathlink;step!=NULL;step=step->next)
    {
        html_node* cursor=level->nodes;
        int remaining=step->site;

        /* walk `site` siblings forward, stopping early if the list ends */
        while(remaining>0 && cursor!=NULL)
        {
            cursor=cursor->next;
            remaining--;
        }
        if(cursor==NULL)
        {
            return NULL;
        }
        if(step->next==NULL)
        {
            return cursor;
        }
        level=cursor->childNodes;
    }

    return NULL;
}
/*
 * Look up an attribute of a node by key (exact, case-sensitive comparison).
 * Returns the first matching attribute cell, or NULL when the node's tag has
 * no attribute with that key.
 */
html_pro* html_findpro(html_node* node,char* proname)
{
    html_pro* candidate;

    for(candidate=node->tag->pro;candidate!=NULL;candidate=candidate->next)
    {
        if(strcmp(candidate->key,proname)!=0)
        {
            continue;
        }
        return candidate;
    }
    return NULL;
}
void main()
{
 html_page mypage;
 char strhtml[1024];
 sprintf(strhtml,"<html><head></head><body><table><tr><td>hello world!</td></tr></table></body></html>");
 mypage.pagecode=strhtml;
 gohtml_init(&mypage);
 mypage.parserHTML(&mypage);
 printf("解释完成!mypage对象为解释结果!");
}

原创粉丝点击