含汉字字符串按字节长度拆分问题的解法

来源:互联网 发布:淘宝蚂蚁花呗怎么还款 编辑:程序博客网 时间:2024/05/03 22:23
import java.io.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Demonstrates two ways of cutting a string that mixes single-byte and
 * double-byte (Chinese) characters into pieces of a fixed byte length.
 *
 * <ul>
 *   <li>{@link #split1(String, int)} cuts the encoded bytes blindly, so a
 *       double-byte character that straddles a boundary is torn in half and
 *       decodes as garbage in both neighbouring pieces.</li>
 *   <li>{@link #split2(String, int)} never tears a character: a double-byte
 *       character that does not fit in the space left in the current piece is
 *       dropped (and reported) instead.</li>
 * </ul>
 *
 * All results are written to {@code System.out}.
 */
public class GoodSplitString
{
  /**
   * Prints the original string, then runs both splitting strategies on it.
   *
   * @param s   the string to split
   * @param mod the piece length in bytes; must be positive
   * @throws IllegalArgumentException if {@code mod} is not positive
   */
  public GoodSplitString(String s,int mod)
  {
    System.out.println("原始字符串为: "+s);
    split1(s,mod);
    split2(s,mod);
  }

  /**
   * Encodes {@code s} as gb2312, cuts the bytes into consecutive pieces of
   * {@code mod} bytes (the last piece may be shorter), decodes every piece on
   * its own and prints one piece per line. Characters torn apart by a cut are
   * not repaired. Prints {@code "no gb2312 "} if the charset is unavailable.
   *
   * @param s   the string to split
   * @param mod the piece length in bytes; must be positive
   * @throws IllegalArgumentException if {@code mod} is not positive
   */
  public void split1(String s,int mod)
  {
    // A zero step would never advance through the byte array.
    checkPieceLength(mod);
    System.out.println("不除掉乱码:split1(String,int)");

    try
    {
      byte[] encoded = s.getBytes("gb2312");
      List<String> pieces = new ArrayList<String>();
      for (int start = 0; start < encoded.length; start += mod)
      {
        // The final piece holds whatever is left, which may be fewer than mod bytes.
        int length = Math.min(mod, encoded.length - start);
        pieces.add(new String(encoded, start, length, "gb2312"));
      }
      printAll(pieces);
    }
    catch (UnsupportedEncodingException ex)
    {
      System.out.println("no gb2312 ");
    }
  }

  /**
   * Encodes {@code stringSrc} as gbk and packs the bytes into pieces of
   * {@code mod} bytes without ever splitting a double-byte character. When a
   * double-byte character does not fit into the space left in the current
   * piece it is discarded and a line reporting the running count and the
   * character is printed immediately; the current piece then keeps filling
   * with the characters that follow. After the whole input has been
   * processed the pieces are printed one per line. No empty trailing piece
   * is produced. Prints {@code "no gbk"} if the charset is unavailable.
   *
   * @param stringSrc the string to split
   * @param mod       the piece length in bytes; must be positive
   * @throws IllegalArgumentException if {@code mod} is not positive
   */
  public void split2(String stringSrc,int mod)
  {
    checkPieceLength(mod);
    System.out.println("除掉乱码:splist2(String,int)");
    List<String> pieces = new ArrayList<String>();
    try
    {
      byte[] encoded = stringSrc.getBytes("gbk");
      // A piece can never hold more bytes than the input has.
      byte[] piece = new byte[Math.min(mod, encoded.length)];
      int used = 0;
      int dropped = 0;
      for (int i = 0; i < encoded.length; i++)
      {
        // In GBK the first byte of a double-byte character has its high bit
        // set, i.e. it is negative as a Java byte. The bounds check keeps a
        // lone lead byte at the very end from indexing past the array.
        boolean doubleByte = encoded[i] < 0 && i + 1 < encoded.length;

        if (doubleByte && used >= mod - 1)
        {
          // Fewer than two bytes of room are left: drop the character rather than tear it.
          dropped++;
          System.out.println("除掉的汉字:" + dropped + " " + new String(encoded, i, 2, "gbk"));
          i++; // skip the trail byte as well
          continue;
        }

        piece[used++] = encoded[i];
        if (doubleByte)
        {
          piece[used++] = encoded[++i];
        }

        if (used == mod)
        {
          pieces.add(new String(piece, 0, used, "gbk"));
          used = 0;
        }
      }

      // Flush a partially filled last piece; skip it when nothing is pending.
      if (used > 0)
      {
        pieces.add(new String(piece, 0, used, "gbk"));
      }
    }
    catch (UnsupportedEncodingException ex)
    {
      System.out.println("no gbk");
    }
    printAll(pieces);
  }

  /** Rejects piece lengths that cannot make progress through the input. */
  private static void checkPieceLength(int mod)
  {
    if (mod <= 0)
    {
      throw new IllegalArgumentException("mod must be positive: " + mod);
    }
  }

  /** Prints every piece on its own line. */
  private static void printAll(List<String> pieces)
  {
    for (String piece : pieces)
    {
      System.out.println(piece);
    }
  }

  public static void main(String[] args)
  {
    String s="a=我,人e们e为中以35经4产1人ie为joe经1发pl";
    new GoodSplitString(s,6);
  }

}

测试结果为:

原始字符串为: a=我,人e们e为中以35经4产1人ie为joe经1发pl
不除掉乱码:split1(String,int)
a=我,?
?e们e?
?中以3
5经4产
1人ie?
?joe经
1发pl
除掉乱码:splist2(String,int)
除掉的汉字:1 人
除掉的汉字:2 中
除掉的汉字:3 以
除掉的汉字:4 为
除掉的汉字:5 发
a=我,e
们e为3
5经4产
1人iej
oe经1p
l