综合

img tainkai

自动辨别文本是不是 UTF-8 编码的 C# 程序

发表于2004/10/20 18:22:00  1600人阅读

分类: C#

/// <summary>
/// Recursively scans a directory tree and, for every file whose lower-cased extension is
/// accepted by CheckFileType (defined elsewhere in this class), tests the content with
/// <see cref="IsUTF8"/>. A file that is not reported as UTF-8 is then read in full twice:
/// once decoded as UTF-8 and once decoded with the system default encoding.
/// </summary>
/// <param name="Path">Directory to scan. Sub-directories are visited before the files of the directory itself.</param>
private void FindNoUTFFile(string Path)
   {
       DirectoryInfo folder = new System.IO.DirectoryInfo(Path);

       // Depth-first: descend into every sub-directory before looking at this directory's files.
       foreach (DirectoryInfo subFolder in folder.GetDirectories())
       {
           FindNoUTFFile(subFolder.FullName);
       }

       foreach (FileInfo file in folder.GetFiles())
       {
           // Invariant-culture lower-casing keeps extensions such as ".INI" comparable to
           // ".ini" even when the current culture has special casing rules for 'I'.
           string extension = file.Extension.ToLower(System.Globalization.CultureInfo.InvariantCulture);
           if (!CheckFileType(extension))
           {
               continue;
           }

           bool isUtf8;
           // 'using' guarantees the handle is released even if IsUTF8 throws.
           using (FileStream input = new FileStream(file.FullName, FileMode.Open, FileAccess.Read))
           {
               isUtf8 = IsUTF8(input);
           }

           if (isUtf8)
           {
               continue;
           }

           // NOTE(review): neither decoded text is used after it is read in the visible code;
           // presumably a conversion/reporting step belongs here - confirm with the author.
           string textAsUtf8;
           using (StreamReader reader = new System.IO.StreamReader(file.FullName, System.Text.Encoding.UTF8))
           {
               textAsUtf8 = reader.ReadToEnd();
           }

           string textAsDefault;
           // Third argument: let a byte-order mark, if present, override the default encoding.
           using (StreamReader reader = new System.IO.StreamReader(file.FullName, System.Text.Encoding.Default, true))
           {
               textAsDefault = reader.ReadToEnd();
           }
       }
   }
  
   //0000 0000-0000 007F - 0xxxxxxx  (ASCII, encoded as 1 octet)
   //0000 0080-0000 07FF - 110xxxxx 10xxxxxx    (2 octet format)
   //0000 0800-0000 FFFF - 1110xxxx 10xxxxxx 10xxxxxx (3 octet format)
   //0001 0000-0010 FFFF - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (4 octet format)

   private static bool IsUTF8(FileStream sbInputStream)
   {
    int   i;
    byte cOctets;  // octets to go in this UTF-8 encoded character
    byte chr;
    bool  bAllAscii= true;
    long iLen = sbInputStream.Length;

    cOctets= 0;
    for( i=0; i < iLen; i++ )
    {
     chr = (byte)sbInputStream.ReadByte();

     if( (chr & 0x80) != 0 ) bAllAscii= false;

     if( cOctets == 0 ) 
     {
      if( chr >= 0x80 )
      { 
       do
       {
        chr <<= 1;
        cOctets++;
       }
       while( (chr & 0x80) != 0 );

       cOctets--;                       
       if( cOctets == 0 ) return false; 
      }
     }
     else
     {
      if( (chr & 0xC0) != 0x80 )
      {
       return false;
      }
      cOctets--;                      
     }
    }

    if( cOctets > 0 )
    { 
     return false;
    }

    if( bAllAscii )
    {   
     return false;
    }

    return true;

   }
  }
   
 
 }

阅读全文
0 0

相关文章推荐

img
取 消
img