如何自动判断url中汉字的编码格式
时间:2010-09-16 来源:神龙升空
参考了 http://topic.csdn.net/u/20091105/15/0d54b7b2-38fe-4cdf-ae1b-5a1f07c26ea0.html 帖子 18 楼的代码，
并在此基础上做了一些修改，以满足我实际项目的需求。
using System;using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Web;
namespace testProgram
{
/// <summary>
/// Decodes percent-encoded URLs whose escaped bytes may be either UTF-8 or GB2312.
/// The encoding is guessed by collecting every <c>%XX</c> escape in the URL and
/// checking whether the resulting byte sequence is structurally valid UTF-8;
/// anything else is decoded as GB2312.
/// </summary>
class MyEncoding
{
    /// <summary>
    /// Demo entry point: decodes one GB2312-escaped and one UTF-8-escaped sample URL,
    /// prints both results and waits for a line of input before exiting.
    /// </summary>
    static void Main()
    {
        MyEncoding myEncoding = new MyEncoding();
        //GB2312
        string gb2312 = "http://www.baidu.com/s?wd=%B9%A4%B3%A7%B9%A9%B5%E7";
        //utf8
        string utf8 = "http://www.google.com.hk/search?hl=zh-CN&newwindow=1&safe=strict&q=%25abc%E4%B8%AD%E5%9B%BD%2C%3B&btnG=Google+%E6%90%9C%E7%B4%A2&aq=f&aqi=&aql=&oq=&gs_rfai=";
        string ss = myEncoding.UrlDecode(gb2312);
        string ss1 = myEncoding.UrlDecode(utf8);
        Console.WriteLine(ss);
        Console.WriteLine(ss1);
        Console.ReadLine();
    }

    /// <summary>
    /// URL-decodes <paramref name="url"/>, choosing UTF-8 when its escaped bytes form
    /// valid multi-byte UTF-8 and GB2312 otherwise.
    /// </summary>
    /// <param name="url">The percent-encoded URL; may be null or empty.</param>
    /// <returns>The decoded text, or <paramref name="url"/> itself when it is null or empty.</returns>
    private string UrlDecode(string url)
    {
        // Nothing to inspect or decode; also avoids a NullReferenceException below.
        if (string.IsNullOrEmpty(url))
        {
            return url;
        }

        byte[] escapedBytes = GetUrlCodingToBytes(url);
        Encoding encoding = IsUTF8(escapedBytes)
            ? Encoding.UTF8
            : Encoding.GetEncoding("GB2312");
        return HttpUtility.UrlDecode(url, encoding);
    }

    /// <summary>
    /// Collects the byte value of every well-formed <c>%XX</c> escape in
    /// <paramref name="url"/>, in order of appearance.
    /// </summary>
    /// <param name="url">The percent-encoded URL; may be null or empty.</param>
    /// <returns>
    /// The escaped bytes concatenated into one array; an empty array when the URL
    /// contains no valid escapes.
    /// </returns>
    private byte[] GetUrlCodingToBytes(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return new byte[0];
        }

        List<byte> bytes = new List<byte>();
        int i = url.IndexOf('%');

        // A complete escape needs two characters after the '%'.
        while (i >= 0 && i + 2 < url.Length)
        {
            char high = url[i + 1];
            char low = url[i + 2];
            if (Uri.IsHexDigit(high) && Uri.IsHexDigit(low))
            {
                bytes.Add((byte)((Uri.FromHex(high) << 4) | Uri.FromHex(low)));
                i = url.IndexOf('%', i + 3);
            }
            else
            {
                // Not a real escape (e.g. "100%zz" or "%%41"): skip only this '%'
                // so a valid escape starting right after it is still picked up.
                i = url.IndexOf('%', i + 1);
            }
        }

        return bytes.ToArray();
    }

    /// <summary>
    /// Checks whether <paramref name="buf"/> is structurally valid UTF-8 that contains
    /// at least one multi-byte character. The buffer is not modified.
    /// </summary>
    /// <param name="buf">The bytes to inspect; may be null.</param>
    /// <returns>
    /// <c>true</c> when every lead byte is followed by the right number of continuation
    /// bytes and at least one non-ASCII byte is present; <c>false</c> for null, empty,
    /// pure-ASCII, truncated or malformed input.
    /// </returns>
    private bool IsUTF8(byte[] buf)
    {
        if (buf == null)
        {
            return false;
        }

        int pending = 0;           // continuation bytes still expected for the current character
        bool sawMultiByte = false; // pure ASCII is deliberately not reported as UTF-8

        foreach (byte b in buf)
        {
            if (pending > 0)
            {
                // Continuation bytes must look like 10xxxxxx.
                if ((b & 0xC0) != 0x80)
                {
                    return false;
                }
                pending--;
                continue;
            }

            if (b < 0x80)
            {
                continue;
            }

            sawMultiByte = true;
            if ((b & 0xE0) == 0xC0)
            {
                pending = 1; // 110xxxxx: two-byte sequence
            }
            else if ((b & 0xF0) == 0xE0)
            {
                pending = 2; // 1110xxxx: three-byte sequence
            }
            else if ((b & 0xF8) == 0xF0)
            {
                pending = 3; // 11110xxx: four-byte sequence
            }
            else
            {
                // Stray continuation byte (10xxxxxx) or an invalid lead byte (0xF8-0xFF).
                return false;
            }
        }

        // A sequence cut off before its last continuation byte is not valid UTF-8.
        return pending == 0 && sawMultiByte;
    }
}
}
相关阅读 更多 +