Linux下打开常见编码(带BOM的UTF-8、UTF-16 LE/BE、GB2312)的TXT文件并打印出来

来源:互联网 发布:黑马程序员培训多少钱 编辑:程序博客网 时间:2024/04/28 17:21


#include <fcntl.h>
#include <iconv.h>
#include <memory.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
/* Size in bytes of the stack scratch buffer used by the convert_* helpers. */
#define OUTLEN 1024




/* File descriptor returned by open() in OpenTextFile(). */
static int g_iFdTextFile;
/* Start of the read-only mmap() of the text file. */
static unsigned char *g_pucTextFileMem;
/* One past the last mapped file byte (g_pucTextFileMem + file size). */
static unsigned char *g_pucTextFileMemEnd;
/* First byte of text in the mapping, i.e. just after any byte-order mark. */
static unsigned char *g_pucLcdFirstPosAtFile;
/* Read cursor that main() advances through the mapping while printing. */
static unsigned char *g_pucLcdNextPosAtFile;




/*
 * Convert inlen bytes of inbuf from from_charset to to_charset with iconv.
 *
 * outbuf is zero-filled over its full outlen bytes before the conversion, so
 * the result is NUL-terminated whenever it is shorter than outlen.
 *
 * Returns 0 on success and -1 on any failure: invalid arguments, unknown
 * charset, invalid or incomplete input sequence, or output buffer too small.
 * On a conversion failure outbuf holds whatever was converted before the
 * error.
 */
int code_convert(char *from_charset, char *to_charset, char *inbuf, int inlen, char *outbuf, int outlen)
{
    iconv_t cd;
    char *pin = inbuf;
    char *pout = outbuf;
    size_t inleft;
    size_t outleft;
    int rc = 0;

    if (from_charset == NULL || to_charset == NULL ||
        inbuf == NULL || outbuf == NULL || inlen < 0 || outlen < 0)
        return -1;

    /* iconv() counts bytes in size_t; passing int pointers is a type error. */
    inleft = (size_t)inlen;
    outleft = (size_t)outlen;

    /* iconv_open() reports failure as (iconv_t)-1, not as 0. */
    cd = iconv_open(to_charset, from_charset);
    if (cd == (iconv_t)-1)
        return -1;

    memset(outbuf, 0, outleft);

    if (iconv(cd, &pin, &inleft, &pout, &outleft) == (size_t)-1)
        rc = -1;
    /* Flush any pending shift sequence for stateful target encodings. */
    else if (iconv(cd, NULL, NULL, &pout, &outleft) == (size_t)-1)
        rc = -1;

    /* Release the descriptor on every path, including conversion errors. */
    iconv_close(cd);
    return rc;
}


/*
 * UTF-8 -> GB2312 wrapper around code_convert().
 * Forwards the buffers unchanged and returns code_convert()'s result.
 */
int u2g(char *inbuf,int inlen,char *outbuf,int outlen)
{
    int status;

    status = code_convert("utf-8", "gb2312", inbuf, inlen, outbuf, outlen);
    return status;
}
/*
 * GB2312 -> UTF-8 wrapper around code_convert().
 * Note that outlen is a size_t here but is narrowed to code_convert()'s int
 * parameter. Returns code_convert()'s result.
 */
int g2u(char *inbuf,int inlen,char *outbuf,size_t outlen)
{
    int status;

    status = code_convert("gb2312", "utf-8", inbuf, inlen, outbuf, outlen);
    return status;
}
/*
 * Convert the NUL-terminated GB2312 string in in_gb2312 to UTF-8 in place.
 *
 * The caller's buffer must be large enough for the converted text: a two-byte
 * GB2312 character can become three UTF-8 bytes, so the result may be up to
 * 1.5 times the input length. The converted text must also fit in OUTLEN - 1
 * bytes.
 *
 * Returns 0 on success. On failure returns non-zero and leaves in_gb2312
 * untouched.
 */
int convert_g2u(char *in_gb2312)
{
    /* Zero-initialised so the buffer is a valid string on every path. */
    char out[OUTLEN] = {0};
    int rc;

    if (in_gb2312 == NULL)
        return -1;

    /* Offer one byte less than the buffer so the last byte stays NUL. */
    rc = g2u(in_gb2312, (int)strlen(in_gb2312), out, OUTLEN - 1);
    if (rc != 0)
        return rc;

    strcpy(in_gb2312, out);
    return 0;
}


/*
 * Convert the NUL-terminated UTF-8 string in in_gb2312 to GB2312 in place.
 * Despite its name, the parameter holds UTF-8 text on entry and GB2312 text
 * on successful return.
 *
 * The converted text must fit in OUTLEN - 1 bytes.
 *
 * Returns 0 on success. On failure returns non-zero and leaves the buffer
 * untouched.
 */
int convert_u2g(char *in_gb2312)
{
    /* Zero-initialised so the buffer is a valid string on every path. */
    char out[OUTLEN] = {0};
    int rc;

    if (in_gb2312 == NULL)
        return -1;

    /* Offer one byte less than the buffer so the last byte stays NUL. */
    rc = u2g(in_gb2312, (int)strlen(in_gb2312), out, OUTLEN - 1);
    if (rc != 0)
        return rc;

    strcpy(in_gb2312, out);
    return 0;
}




/*
 * Convert the NUL-terminated GB2312 string in in_gb2312 to UTF-8 in place.
 * Behaves the same as convert_g2u().
 *
 * The caller's buffer must be large enough for the converted text (up to 1.5
 * times the input length), and the converted text must fit in OUTLEN - 1
 * bytes.
 *
 * Returns 0 on success. On failure returns non-zero and leaves in_gb2312
 * untouched.
 */
int convert(char *in_gb2312)
{
    /* Zero-initialised so the buffer is a valid string on every path. */
    char out[OUTLEN] = {0};
    int rc;

    if (in_gb2312 == NULL)
        return -1;

    /* Offer one byte less than the buffer so the last byte stays NUL. */
    rc = g2u(in_gb2312, (int)strlen(in_gb2312), out, OUTLEN - 1);
    if (rc != 0)
        return rc;

    strcpy(in_gb2312, out);
    return 0;
}






//---------------------------------------------------------------------------------
/*
 * Return 1 when the buffer starts with the UTF-8 byte-order mark EF BB BF,
 * otherwise 0. Bytes are examined left to right and the check stops at the
 * first mismatch.
 */
static int isUtf8Coding(unsigned char *pucBufHead)
{
    return (pucBufHead[0] == 0xEF &&
            pucBufHead[1] == 0xBB &&
            pucBufHead[2] == 0xBF) ? 1 : 0;
}


/*
 * Return 1 when the buffer starts with the UTF-16 little-endian byte-order
 * mark FF FE, otherwise 0.
 */
static int isUtf16leCoding(unsigned char *pucBufHead)
{
    if (pucBufHead[0] != 0xFF)
        return 0;
    if (pucBufHead[1] != 0xFE)
        return 0;
    return 1;
}


/*
 * Return 1 when the buffer starts with the UTF-16 big-endian byte-order
 * mark FE FF, otherwise 0.
 */
static int isUtf16beCoding(unsigned char *pucBufHead)
{
    if (pucBufHead[0] != 0xFE)
        return 0;
    if (pucBufHead[1] != 0xFF)
        return 0;
    return 1;
}


/*
 * Return 1 when the buffer does NOT start with a UTF-8 (EF BB BF),
 * UTF-16LE (FF FE) or UTF-16BE (FE FF) byte-order mark, otherwise 0.
 * Each check short-circuits on the first mismatching byte.
 */
static int isAsciiCoding(unsigned char *pucBufHead)
{
    int iHasUtf8Bom;
    int iHasUtf16leBom;
    int iHasUtf16beBom;

    iHasUtf8Bom = pucBufHead[0] == 0xEF &&
                  pucBufHead[1] == 0xBB &&
                  pucBufHead[2] == 0xBF;
    iHasUtf16leBom = pucBufHead[0] == 0xFF && pucBufHead[1] == 0xFE;
    iHasUtf16beBom = pucBufHead[0] == 0xFE && pucBufHead[1] == 0xFF;

    return !(iHasUtf8Bom || iHasUtf16leBom || iHasUtf16beBom);
}




//---------------------------------------------------------------------------------
/*
 * Count the consecutive 1 bits at the most-significant end of ucVal.
 * Returns a value from 0 (top bit clear) to 8 (ucVal == 0xFF).
 */
static int GetPreOneBits(unsigned char ucVal)
{
    unsigned int uiMask = 0x80u;
    int iCount = 0;

    /* Walk the mask from bit 7 down until a 0 bit or the end of the byte. */
    while (uiMask != 0 && (ucVal & uiMask) != 0)
    {
        iCount++;
        uiMask >>= 1;
    }

    return iCount;
}


//---------------------------------------------------------------------------------
/*
 * Decode one UTF-8 sequence starting at pucBufStart.
 *
 * UTF-8 byte layout, for any byte B:
 *   - top bit 0:          B is an ASCII character on its own;
 *   - top bits 10:        B is a continuation byte of a multi-byte character;
 *   - top bits 110:       B starts a two-byte character;
 *   - top bits 1110:      B starts a three-byte character;
 *   - top bits 11110:     B starts a four-byte character.
 * So the number of leading 1 bits of the first byte gives the sequence length.
 *
 * pucBufEnd points one past the last readable byte.
 *
 * Returns the number of bytes consumed and stores the decoded value in
 * *pdwCode. Returns 0, leaving *pdwCode untouched, when the buffer is
 * exhausted or the sequence would run past pucBufEnd. Continuation bytes are
 * not validated.
 */
static int Utf8GetCodeFrmBuf(unsigned char *pucBufStart, unsigned char *pucBufEnd, unsigned int *pdwCode)
{
    unsigned char ucLead;
    unsigned int dwCode;
    int iSeqLen;
    int iIdx;

    /* End of data. */
    if (pucBufStart >= pucBufEnd)
        return 0;

    ucLead = pucBufStart[0];
    iSeqLen = GetPreOneBits(ucLead);

    /* The sequence would extend past the end of the data. */
    if (iSeqLen > pucBufEnd - pucBufStart)
        return 0;

    /* Plain ASCII: the byte is the code. */
    if (iSeqLen == 0)
    {
        *pdwCode = ucLead;
        return 1;
    }

    /* Drop the length-marker bits from the lead byte, keeping its payload. */
    dwCode = ucLead & (0xFFu >> iSeqLen);

    /* Each following byte contributes its low six bits. */
    for (iIdx = 1; iIdx < iSeqLen; iIdx++)
        dwCode = (dwCode << 6) | (pucBufStart[iIdx] & 0x3Fu);

    *pdwCode = dwCode;
    return iSeqLen;
}


//---------------------------------------------------------------------------------
/*
 * Open pcFileName, map it read-only into memory and detect its encoding from
 * the byte-order mark.
 *
 * On success the file-scope state is set up: g_iFdTextFile (left open),
 * g_pucTextFileMem / g_pucTextFileMemEnd (the mapping bounds) and
 * g_pucLcdFirstPosAtFile (first byte after any BOM).
 *
 * Returns:
 *    0  UTF-8 with BOM
 *    1  UTF-16 little endian
 *    2  UTF-16 big endian
 *    3  no BOM (treated as ANSI by the caller)
 *   -1  the file could not be opened, stat'ed or mapped; this includes an
 *       empty file, because a zero-length mmap() fails
 */
int OpenTextFile(char *pcFileName)
{
    struct stat tStat;
    size_t uFileSize;
    void *pvMem;

    g_iFdTextFile = open(pcFileName, O_RDONLY);
    if (g_iFdTextFile < 0)
    {
        printf("can't open text file %s\n", pcFileName);
        return -1;
    }

    if (fstat(g_iFdTextFile, &tStat) != 0)
    {
        printf("can't get fstat\n");
        goto err_close;
    }

    uFileSize = (size_t)tStat.st_size;
    pvMem = mmap(NULL, uFileSize, PROT_READ, MAP_SHARED, g_iFdTextFile, 0);
    if (pvMem == MAP_FAILED)
    {
        printf("can't mmap for text file\n");
        goto err_close;
    }

    g_pucTextFileMem = (unsigned char *)pvMem;
    g_pucTextFileMemEnd = g_pucTextFileMem + uFileSize;

    /* Only test for a BOM when the file is long enough to contain it. */
    if (uFileSize >= 3 && isUtf8Coding(g_pucTextFileMem))
    {
        g_pucLcdFirstPosAtFile = g_pucTextFileMem + 3;
        return 0;
    }
    if (uFileSize >= 2 && isUtf16leCoding(g_pucTextFileMem))
    {
        g_pucLcdFirstPosAtFile = g_pucTextFileMem + 2;
        return 1;
    }
    if (uFileSize >= 2 && isUtf16beCoding(g_pucTextFileMem))
    {
        g_pucLcdFirstPosAtFile = g_pucTextFileMem + 2;
        return 2;
    }

    /* No BOM: ANSI. */
    g_pucLcdFirstPosAtFile = g_pucTextFileMem;
    return 3;

err_close:
    /* Do not leak the descriptor when setup fails after open(). */
    close(g_iFdTextFile);
    g_iFdTextFile = -1;
    return -1;
}
/*
 * Encode one code point as UTF-8.
 *
 *   unic     value to encode, 0 .. 0x7FFFFFFF (the original 1- to 6-byte
 *            UTF-8 scheme; values above 0x10FFFF are kept for compatibility)
 *   pOutput  destination buffer
 *   outSize  number of bytes available at pOutput
 *
 * Returns the number of bytes written (1..6). Returns 0 and writes nothing
 * when pOutput is NULL, unic is out of range, or outSize is too small for the
 * encoding. When there is room for one more byte, a terminating NUL is
 * written after the sequence (not counted in the return value) so the result
 * can be used as a C string.
 */
int enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput,int outSize)
{
    unsigned char ucLeadPrefix;
    int iLen;
    int i;

    if (pOutput == NULL)
        return 0;

    /* Pick the sequence length and the marker bits of the lead byte. */
    if (unic <= 0x0000007FUL)
    {
        iLen = 1;            /* 0xxxxxxx */
        ucLeadPrefix = 0x00;
    }
    else if (unic <= 0x000007FFUL)
    {
        iLen = 2;            /* 110xxxxx 10xxxxxx */
        ucLeadPrefix = 0xC0;
    }
    else if (unic <= 0x0000FFFFUL)
    {
        iLen = 3;            /* 1110xxxx 10xxxxxx 10xxxxxx */
        ucLeadPrefix = 0xE0;
    }
    else if (unic <= 0x001FFFFFUL)
    {
        iLen = 4;            /* 11110xxx + 3 continuation bytes */
        ucLeadPrefix = 0xF0;
    }
    else if (unic <= 0x03FFFFFFUL)
    {
        iLen = 5;            /* 111110xx + 4 continuation bytes */
        ucLeadPrefix = 0xF8;
    }
    else if (unic <= 0x7FFFFFFFUL)
    {
        iLen = 6;            /* 1111110x + 5 continuation bytes */
        ucLeadPrefix = 0xFC;
    }
    else
    {
        return 0;
    }

    /* Never write past the space the caller says it has. */
    if (outSize < iLen)
        return 0;

    /* Fill continuation bytes from the end, six payload bits at a time. */
    for (i = iLen - 1; i > 0; i--)
    {
        pOutput[i] = (unsigned char)((unic & 0x3F) | 0x80);
        unic >>= 6;
    }
    /* Whatever remains fits in the payload bits of the lead byte. */
    pOutput[0] = (unsigned char)(unic | ucLeadPrefix);

    if (outSize > iLen)
        pOutput[iLen] = '\0';

    return iLen;
}


//---------------------------------------------------------------------------------
/*
 * Largest GB2312 chunk converted in one go. A two-byte GB2312 character can
 * grow to three UTF-8 bytes, so 680 input bytes yield at most 1020 output
 * bytes, which fits both the 1024-byte line buffer in main() and the
 * OUTLEN-sized scratch buffer inside convert_g2u().
 */
enum { GB2312_CHUNK_MAX = 680 };

/* Read one 16-bit UTF-16 code unit from two bytes in the given byte order. */
static unsigned long ReadUtf16Unit(const unsigned char *pucPos, int iBigEndian)
{
    if (iBigEndian)
        return ((unsigned long)pucPos[0] << 8) | pucPos[1];
    return ((unsigned long)pucPos[1] << 8) | pucPos[0];
}

/*
 * Write the UTF-16 text in [pucPos, pucEnd) to stdout as UTF-8. Surrogate
 * pairs are combined into one code point, unpaired surrogates are replaced
 * by U+FFFD, and a trailing odd byte is ignored.
 */
static void PrintUtf16AsUtf8(const unsigned char *pucPos, const unsigned char *pucEnd, int iBigEndian)
{
    unsigned char aucUtf8[8];

    while (pucEnd - pucPos >= 2)
    {
        unsigned long ulCode = ReadUtf16Unit(pucPos, iBigEndian);
        int iLen;

        pucPos += 2;

        if (ulCode >= 0xD800UL && ulCode <= 0xDBFFUL)
        {
            /* High surrogate: it needs a low surrogate right after it. */
            unsigned long ulLow = 0;

            if (pucEnd - pucPos >= 2)
                ulLow = ReadUtf16Unit(pucPos, iBigEndian);

            if (ulLow >= 0xDC00UL && ulLow <= 0xDFFFUL)
            {
                ulCode = 0x10000UL + ((ulCode - 0xD800UL) << 10) + (ulLow - 0xDC00UL);
                pucPos += 2;
            }
            else
            {
                ulCode = 0xFFFDUL;
            }
        }
        else if (ulCode >= 0xDC00UL && ulCode <= 0xDFFFUL)
        {
            /* Low surrogate without a preceding high surrogate. */
            ulCode = 0xFFFDUL;
        }

        /* Write by length: the encoder's output need not be NUL-terminated. */
        iLen = enc_unicode_to_utf8_one(ulCode, aucUtf8, (int)sizeof aucUtf8);
        if (iLen > 0)
            fwrite(aucUtf8, 1, (size_t)iLen, stdout);
    }
}

/*
 * Length of the longest prefix of [pucPos, pucPos + uAvail) that is at most
 * uMax bytes and does not end in the middle of a two-byte GB2312 character.
 * Bytes with the top bit set are treated as the first half of a two-byte
 * character. Always returns at least 1 when uAvail > 0 so the caller makes
 * progress even on a truncated trailing character.
 */
static size_t Gb2312ChunkLen(const unsigned char *pucPos, size_t uAvail, size_t uMax)
{
    size_t uLimit = uAvail < uMax ? uAvail : uMax;
    size_t uLen = 0;

    while (uLen < uLimit)
    {
        size_t uStep = (pucPos[uLen] & 0x80) ? 2 : 1;

        if (uLen + uStep > uLimit)
            break;
        uLen += uStep;
    }

    if (uLen == 0)
        uLen = uLimit;
    return uLen;
}

/*
 * Print the text file named by argv[1] to stdout as UTF-8.
 *
 * The encoding is taken from OpenTextFile(): UTF-8 with BOM is copied as is,
 * UTF-16 LE/BE is transcoded code point by code point, and anything without
 * a BOM is treated as GB2312 and converted with convert_g2u().
 *
 * Returns 0 on success and -1 on a usage, open or conversion error.
 */
int main(int argc, char **argv)
{
    int iCodeType;
    int iRet = 0;
    char sStr[1024];

    if (argc < 2)
    {
        printf("input file name!\n");
        return -1;
    }

    iCodeType = OpenTextFile(argv[1]);
    if (-1 == iCodeType)
    {
        printf("open file : %s error!\n", argv[1]);
        return -1;
    }

    /* The cursor is only meaningful once the file has been mapped. */
    g_pucLcdNextPosAtFile = g_pucLcdFirstPosAtFile;

    switch (iCodeType) /* the four encodings Notepad can save */
    {
    case 0: /* UTF-8: already in the output encoding */
        /* The mapping is not NUL-terminated, so write by length, not "%s". */
        fwrite(g_pucLcdFirstPosAtFile, 1,
               (size_t)(g_pucTextFileMemEnd - g_pucLcdFirstPosAtFile), stdout);
        g_pucLcdNextPosAtFile = g_pucTextFileMemEnd;
        break;

    case 1: /* UTF-16 little endian */
        PrintUtf16AsUtf8(g_pucLcdFirstPosAtFile, g_pucTextFileMemEnd, 0);
        g_pucLcdNextPosAtFile = g_pucTextFileMemEnd;
        break;

    case 2: /* UTF-16 big endian */
        PrintUtf16AsUtf8(g_pucLcdFirstPosAtFile, g_pucTextFileMemEnd, 1);
        g_pucLcdNextPosAtFile = g_pucTextFileMemEnd;
        break;

    case 3: /* ANSI, converted as GB2312 */
        while (g_pucLcdNextPosAtFile < g_pucTextFileMemEnd)
        {
            size_t uAvail = (size_t)(g_pucTextFileMemEnd - g_pucLcdNextPosAtFile);
            size_t uChunk = Gb2312ChunkLen(g_pucLcdNextPosAtFile, uAvail, GB2312_CHUNK_MAX);

            memcpy(sStr, g_pucLcdNextPosAtFile, uChunk);
            sStr[uChunk] = '\0';

            if (convert_g2u(sStr) != 0)
            {
                fprintf(stderr, "can't convert text to utf-8\n");
                iRet = -1;
                break;
            }
            fputs(sStr, stdout);
            g_pucLcdNextPosAtFile += uChunk;
        }
        putchar('\n');
        break;

    default:
        break;
    }

    return iRet;
}
0 0
原创粉丝点击