html2txt h2t.c

来源:互联网 发布:android 广告机源码 编辑:程序博客网 时间:2024/05/16 14:19
/*
 * File:    h2t.c  v0.2
 * Author:  Su Xiao (suxiaojack)
 * Date:    2008.7
 * Purpose: convert HTML content to plain text
 * License: GPL
 *
 * v0.2 bug fixes:
 *   1. Numeric references (&#number;) were not recognised; they are now
 *      converted UNICODE => GB2312.
 *   2. Added handling of &copy; and &reg;.
 *   3. Fixed an endless loop while handling '&'.
 * v0.1
 */
#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include <windows.h>
#endif

#define BUFSIZE (1024*1024*2)

char buf[1024*1024*20];        /* the whole input document, NUL-terminated */
char shadowbuf[1024*1024*20];  /* lower-cased copy of buf, used only for searching */
char buffer[BUFSIZE];          /* scratch buffer for one fgets() call */
long size;                     /* number of bytes stored in buf */
int type = 0;                  /* line-break style of the input: 0 unknown, 1 CR, 2 CRLF, 3 LF */

#define tocsize 14
/*
 * Entity table, stored as (entity, replacement) pairs.
 * There are far too many entities; these are probably the commonly used ones.
 * The Windows console cannot print symbols such as the registered-trademark
 * sign, hence the textual substitutes for &copy; and &reg;.
 */
char *toc[tocsize] = {
    "&nbsp;", " ",
    "&lt;", "<",
    "&gt;", ">",
    "&quot;", "\"",
    "&amp;", "&",
    "&copy;", "◎版权",
    "&reg;", "◎注册"
};

/* Lower-case the ASCII letters of s in place; every other byte is left alone. */
static void lower_ascii(char *s)
{
    for (; *s != '\0'; s++) {
        if (*s >= 'A' && *s <= 'Z') {
            *s = (char)(*s - 'A' + 'a');
        }
    }
}

/* Copy at most 15 characters of s into window[16], never reading past the NUL of s. */
static void copy_window(char window[16], const char *s)
{
    size_t n = 0;

    while (n < 15 && s[n] != '\0') {
        window[n] = s[n];
        n++;
    }
    window[n] = '\0';
}

/* Value of digit c in the given base (10 or 16), or -1 when c is not a digit. */
static int digit_value(char c, int base)
{
    if (c >= '0' && c <= '9') {
        return c - '0';
    }
    if (base == 16) {
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + 10;
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + 10;
        }
    }
    return -1;
}

/*
 * Convert one UTF-16 code unit to the multibyte encoding in use.
 *
 * On Windows this is the system ANSI code page (GB2312 on a Simplified
 * Chinese system) and at most sizeof(WCHAR) bytes are written.  Elsewhere
 * the encoding of the current C locale is used and pOut must have room for
 * MB_LEN_MAX bytes.  The output is NOT NUL-terminated.
 *
 * Returns the number of bytes written to pOut, or 0 when the character
 * cannot be converted.
 */
int UnicodeToGB2312(char *pOut, unsigned short uData)
{
#ifdef _WIN32
    WCHAR wide = (WCHAR)uData;
    int written = WideCharToMultiByte(CP_ACP, 0, &wide, 1, pOut,
                                      (int)sizeof(WCHAR), NULL, NULL);

    return written > 0 ? written : 0;
#else
    char encoded[MB_LEN_MAX];
    int written;

    (void)wctomb(NULL, 0);  /* reset any shift state before converting */
    written = wctomb(encoded, (wchar_t)uData);
    if (written <= 0) {
        return 0;
    }
    memcpy(pOut, encoded, (size_t)written);
    return written;
#endif
}

/* Print the command-line help; argv[0] is shown as the program name. */
void usage(char **argv)
{
    const char *prog = (argv != NULL && argv[0] != NULL) ? argv[0] : "h2t";
    const char *us = "用来转换html =>txt. ver0.2\n"
                     "suxiaojack写于2008.7\n";
    const char *ue = "tstart_in_tag_text:开始的Tag标记中的特征文字,好理解end_in_tag_text了。\n"
                     "jump_num:跳过几次开始找到的,默认为0.\n"
                     "注意不支持正则式!未曾处理水印文字。\n";

    printf("%s", us);
    printf("使用方法:%s <file> [ <start_in_tag_text> [jump_num] <end_in_tag_text> ] \n", prog);
    printf("%s", ue);
}

/*
 * Quick bounded strstr: look for f within the first 15 characters of s.
 * Returns the offset of the match, or -1 when there is none.  Callers
 * compare the result with 0 to ask "does s start with f?".
 */
int ministrstr(const char *s, const char *f)
{
    char minibuf[16];
    const char *hit;

    copy_window(minibuf, s);
    hit = strstr(minibuf, f);
    return hit != NULL ? (int)(hit - minibuf) : -1;
}

/*
 * Same as ministrstr(), but the window taken from s is lower-cased first,
 * so f must be given in lower case.
 */
int ministrstri(const char *s, const char *f)
{
    char minibuf[16];
    const char *hit;

    copy_window(minibuf, s);
    lower_ascii(minibuf);
    hit = strstr(minibuf, f);
    return hit != NULL ? (int)(hit - minibuf) : -1;
}

/*
 * Translate a named entity such as &nbsp; found at streamstart.
 * Prints the replacement text and returns the length of the entity.  When
 * the text is not a known entity a literal '&' is printed and 1 is returned,
 * so the caller always makes progress.
 */
int isintoc(char *streamstart)
{
    int i;

    for (i = 0; i + 1 < tocsize; i += 2) {
        if (ministrstr(streamstart, toc[i]) == 0) {
            fputs(toc[i + 1], stdout);
            return (int)strlen(toc[i]);
        }
    }
    /* Not a known entity: emit the ampersand unchanged. */
    putchar('&');
    return 1;
}

/*
 * Translate the body of a numeric character reference.  numstart points just
 * past "&#"; both decimal ("20013;") and hexadecimal ("x4e2d;") forms are
 * accepted.  The converted character is printed; values that do not fit in
 * 16 bits or cannot be converted print nothing.
 *
 * Returns the number of characters consumed from numstart, including the
 * terminating ';' when present, or 0 when no digits were found (nothing is
 * printed in that case).
 */
int num2txt(char *numstart)
{
    const char *s = numstart;
    unsigned long value = 0;
    int base = 10;
    int digits = 0;
    char os[MB_LEN_MAX + 1];

    if (*s == 'x' || *s == 'X') {
        base = 16;
        s++;
    }
    for (;;) {
        int d = digit_value(*s, base);

        if (d < 0) {
            break;
        }
        /* Stop accumulating once out of the Unicode range: avoids overflow. */
        if (value <= 0x10FFFFUL) {
            value = value * (unsigned long)base + (unsigned long)d;
        }
        digits++;
        s++;
    }
    if (digits == 0) {
        return 0;
    }
    if (value <= 0xFFFFUL) {
        int written = UnicodeToGB2312(os, (unsigned short)value);

        os[written] = '\0';
        fputs(os, stdout);
    }
    /* Skip the terminator only when it really is one. */
    if (*s == ';') {
        s++;
    }
    return (int)(s - numstart);
}

/*
 * Read the whole file into buf (setting size) and keep a lower-cased copy in
 * shadowbuf.  Input beyond the capacity of buf is dropped with a warning.
 */
void read2buf(FILE *fp)
{
    const size_t capacity = sizeof(buf) - 1;
    size_t used = 0;

    /* Loop on the fgets() result: testing feof() first repeats the last line. */
    while (used < capacity && fgets(buffer, (int)sizeof(buffer), fp) != NULL) {
        size_t chunk = strlen(buffer);

        if (chunk > capacity - used) {
            chunk = capacity - used;
        }
        memcpy(buf + used, buffer, chunk);
        used += chunk;
    }
    if (used == capacity && fgetc(fp) != EOF) {
        fprintf(stderr, "h2t: input truncated to %lu bytes\n",
                (unsigned long)capacity);
    }
    buf[used] = '\0';
    size = (long)used;

    memcpy(shadowbuf, buf, used + 1);
    lower_ascii(shadowbuf);
}

/*
 * Locate the start of the selection.  `start` is text that appears inside
 * the opening tag (it is lower-cased in place); `jump` is the number of
 * earlier matches to skip.
 *
 * Returns the index of the first character after the '>' that closes that
 * tag, or -1 when the text or the closing '>' cannot be found.
 */
int findstart(char *start, int jump)
{
    char *pos = shadowbuf;

    lower_ascii(start);
    if (*start == '\0') {
        return -1;
    }
    if (jump < 0) {
        jump = 0;
    }
    do {
        pos = strstr(pos, start);
        if (pos == NULL) {
            return -1;
        }
        pos++;
    } while (jump-- > 0);

    pos = strchr(pos, '>');
    if (pos == NULL) {
        return -1;
    }
    return (int)(pos + 1 - shadowbuf);
}

/*
 * Locate the end of the selection.  `end` is text that appears inside the
 * closing tag (it is lower-cased in place); the search begins at index
 * `start`.
 *
 * Returns the index of the last character before the '<' that opens that
 * tag, or -1 when the text or the '<' cannot be found.
 */
int findend(char *end, int start)
{
    char *pos;

    lower_ascii(end);
    if (*end == '\0' || start < 0 || (long)start > size) {
        return -1;
    }
    pos = strstr(shadowbuf + start, end);
    if (pos == NULL) {
        return -1;
    }
    /* Walk back to the '<' of the tag without leaving the buffer. */
    while (pos > shadowbuf && *pos != '<') {
        pos--;
    }
    if (*pos != '<') {
        return -1;
    }
    return (int)(pos - shadowbuf) - 1;
}

/* Emit one line break in the style detected in the input (nothing if unknown). */
void printline(void)
{
    switch (type) {
    case 1:
        putchar('\r');
        break;
    case 2:
        fputs("\r\n", stdout);
        break;
    case 3:
        putchar('\n');
        break;
    default:
        break;
    }
}

/* Advance to just past the next '>' in [ss, end); returns end if there is none. */
static char *skip_past_gt(char *ss, char *end)
{
    while (ss < end && *ss != '>') {
        ss++;
    }
    return ss < end ? ss + 1 : end;
}

/*
 * ss points at the '<' of an element whose content must be dropped
 * (<script> or <style>).  Returns the position just past the '>' of the
 * matching closing tag, or end when the element is not closed in range.
 */
static char *skip_element(char *ss, char *end, const char *closing)
{
    for (ss++; ss < end; ss++) {
        if (*ss == '<' && ministrstri(ss, closing) == 0) {
            return skip_past_gt(ss, end);
        }
    }
    return end;
}

/*
 * Convert len bytes of HTML starting at s and write the text to stdout.
 * s must be part of a NUL-terminated string: entity look-ups may peek a few
 * characters beyond s + len.
 */
void h2t(char *s, int len)
{
    char *ss = s;
    char *end;

    if (len <= 0) {
        return;
    }
    end = s + len;

    while (ss < end) {
        /* Learn the document's line-break style from the first one seen. */
        if (type == 0 && (*ss == '\r' || *ss == '\n')) {
            if (*ss == '\n') {
                type = 3;
            } else if (ss[1] == '\n') {
                type = 2;
            } else {
                type = 1;
            }
        }

        if (*ss == '&') {
            int used = 0;

            if (ss[1] == '#') {
                used = num2txt(ss + 2);
                if (used > 0) {
                    used += 2;  /* account for the "&#" prefix */
                }
            }
            if (used == 0) {
                used = isintoc(ss);  /* always consumes at least the '&' */
            }
            ss += used;
        } else if (*ss != '<') {
            /* Ordinary text. */
            putchar(*ss);
            ss++;
        } else if (ministrstri(ss, "<script") == 0) {
            ss = skip_element(ss, end, "</script");
        } else if (ministrstri(ss, "<style") == 0) {
            ss = skip_element(ss, end, "</style");
        } else if (ministrstri(ss, "</br>") == 0) {
            printline();
            ss += 5;
        } else if (ministrstri(ss, "</p>") == 0) {
            printline();
            ss += 4;
        } else if (ministrstri(ss, "<br>") == 0) {
            printline();
            ss += 4;
        } else {
            /* Any other tag is dropped. */
            ss = skip_past_gt(ss, end);
        }
    }
}

/*
 * h2t <file>                                     convert the whole file
 * h2t <file> <start_text> <end_text>             convert between two tags
 * h2t <file> <start_text> <jump_num> <end_text>  same, skipping jump_num matches
 */
int main(int argc, char *argv[])
{
    FILE *fp;
    char *start_text = NULL;
    char *end_text = NULL;
    int jump = 0;
    int start;
    int end;

    if (argc == 4) {
        start_text = argv[2];
        end_text = argv[3];
    } else if (argc == 5) {
        start_text = argv[2];
        jump = atoi(argv[3]);
        end_text = argv[4];
    } else if (argc != 2) {
        usage(argv);
        return 0;
    }

#ifndef _WIN32
    /* The wctomb() fallback converts to the encoding of the user's locale. */
    setlocale(LC_CTYPE, "");
#endif

    fp = fopen(argv[1], "r");
    if (fp == NULL) {
        usage(argv);
        return EXIT_FAILURE;
    }
    read2buf(fp);
    fclose(fp);

    if (start_text == NULL) {
        h2t(buf, (int)size);
        return 0;
    }

    start = findstart(start_text, jump);
    if (start < 0) {
        printf("can't find:%s\n", start_text);
        return 1;
    }
    end = findend(end_text, start);
    if (end < 0) {
        printf("can't find:%s\n", end_text);
        return 1;
    }
    if (start <= end) {
        /* end is the index of the last wanted character, so it is included. */
        h2t(buf + start, end - start + 1);
    } else {
        usage(argv);
    }
    return 0;
}
/*
 * Intended for use together with noblank:
 *   h2t filename.htm | noblank > out.txt
 */