关于网页中编码gb2312,big5,gbk,utf-8识别的最新..
时间:2010-12-26 来源:yjm0573
关于网页中编码gb2312,big5,gbk,utf-8识别的最新方法的探讨和c++的实现
首先考虑utf-8编码的判断
utf-8编码的判断格式如下:
1字节 0xxxxxxx
2字节 110xxxxx 10xxxxxx
3字节 1110xxxx 10xxxxxx 10xxxxxx
4字节 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5字节 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6字节 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
这是标准的utf-8编码格式,所以如果网页是utf-8网页,那么必然遵循这个规律
函数实现:
// Tells whether c has the bit pattern 10xxxxxx, i.e. whether its two most
// significant bits are binary 10 (the shape of a UTF-8 continuation byte).
// Returns 1 if so, 0 otherwise.
int Encoder::is_utf8_special_byte(unsigned char c)
{
    // Keep only the top two bits (mask 11000000) and compare with 10000000.
    const unsigned top_bits = c & 0xC0u;
    return (top_bits == 0x80u) ? 1 : 0;
}
int Encoder::is_utf8_code(const string& str)
{
unsigned one_byte = 0X00; //binary 00000000
unsigned two_byte = 0X06; //binary 00000110
unsigned three_byte = 0X0E; //binary 00001110
unsigned four_byte = 0X1E; //binary 00011110
unsigned five_byte = 0X3E; //binary 00111110
unsigned six_byte = 0X7E; //binary 01111110
int utf8_yes = 0;
int utf8_no = 0;
unsigned char k = 0;
unsigned char m = 0;
unsigned char n = 0;
unsigned char p = 0;
unsigned char q = 0;
unsigned char c = 0;
for (uint i=0; i<str.size();) {
c = (unsigned char)str[i];
if (c>>7 == one_byte) {
i++;
continue;
} else if (c>>5 == two_byte) {
k = (unsigned char)str[i+1];
if ( is_utf8_special_byte(k) ) {
utf8_yes++;
i += 2;
continue;
}
} else if (c>>4 == three_byte) {
m = (unsigned char)str[i+1];
n = (unsigned char)str[i+2];
if ( is_utf8_special_byte(m)
&& is_utf8_special_byte(n) ) {
utf8_yes++;
i += 3;
continue;
}
} else if (c>>3 == four_byte) {
k = (unsigned char)str[i+1];
m = (unsigned char)str[i+2];
n = (unsigned char)str[i+3];
if ( is_utf8_special_byte(k)
&& is_utf8_special_byte(m)
&& is_utf8_special_byte(n) ) {
utf8_yes++;
i += 4;
continue;
}
} else if (c>>2 == five_byte) {
unsigned char k = (unsigned char)str[i+1];
unsigned char m = (unsigned char)str[i+2];
unsigned char n = (unsigned char)str[i+3];
unsigned char p = (unsigned char)str[i+4];
if ( is_utf8_special_byte(k)
&& is_utf8_special_byte(m)
&& is_utf8_special_byte(n)
&& is_utf8_special_byte(p) ) {
utf8_yes++;
i += 5;
continue;
}
} else if (c>>1 == six_byte) {
k = (unsigned char)str[i+1];
m = (unsigned char)str[i+2];
n = (unsigned char)str[i+3];
p = (unsigned char)str[i+4];
q = (unsigned char)str[i+5];
if ( is_utf8_special_byte(k)
&& is_utf8_special_byte(m)
&& is_utf8_special_byte(n)
&& is_utf8_special_byte(p)
&& is_utf8_special_byte(q) ) {
utf8_yes++;
i += 6;
continue;
}
}
utf8_no++;
i++;
}
printf("%d %d\n", utf8_yes, utf8_no);
int ret = (100*utf8_yes)/(utf8_yes + utf8_no);
if (ret > 90) {
return 1;
} else {
return 0;
}
}
实现原理:判断网页文本中符合utf-8规则的字数和不符合utf-8规则的字数
如果符合的字数超过90%,则判断为utf-8编码
其次应该是gb2312编码的判断,由于gb2312相对gbk和big5的编码范围要小,所以
在gb2312和gbk和big5之间,应该首先判断该网页文本是否是gb2312
函数实现:
int Encoder::is_gb2312_code(const string& str)
{
unsigned one_byte = 0X00; //binary 00000000
int gb2312_yes = 0;
int gb2312_no = 0;
unsigned char k = 0;
unsigned char c = 0;
for (uint i=0; i<str.size();) {
c = (unsigned char)str[i];
if (c>>7 == one_byte) {
i++;
continue;
} else if (c >= 0XA1 && c <= 0XF7) {
k = (unsigned char)str[i+1];
if (k >= 0XA1 && k <= 0XFE) {
gb2312_yes++;
i += 2;
continue;
}
}
gb2312_no++;
i += 2;
}
printf("%d %d\n", gb2312_yes, gb2312_no);
int ret = (100*gb2312_yes)/(gb2312_yes+gb2312_no);
if (ret > 90) {
return 1;
} else {
return 0;
}
}
实现原理:统计符合gb2312编码特征的字数和不符合gb2312编码特征的字数
如果符合的字数超过90%,则判断该网页文本为gb2312
再者应该判断big5编码,因为gbk的编码范围比big5的编码范围更广
函数实现
int Encoder::is_big5_code(const string& str)
{
unsigned one_byte = 0X00; //binary 00000000
int big5_yes = 0;
int big5_no = 0;
unsigned char k = 0;
unsigned char c = 0;
for (uint i=0; i<str.size();) {
c = (unsigned char)str[i];
if (c>>7 == one_byte) {
i++;
continue;
} else if (c >= 0XA1 && c <= 0XF9) {
k = (unsigned char)str[i+1];
if ( k >= 0X40 && k <= 0X7E
|| k >= 0XA1 && k <= 0XFE) {
big5_yes++;
i += 2;
continue;
}
}
big5_no++;
i += 2;
}
printf("%d %d\n", big5_yes, big5_no);
int ret = (100*big5_yes)/(big5_yes+big5_no);
if (ret > 90) {
return 1;
} else {
return 0;
}
}
实现原理同gb2312
最后是gbk的判断
函数实现
int Encoder::is_gbk_code(const string& str)
{
unsigned one_byte = 0X00; //binary 00000000
int gbk_yes = 0;
int gbk_no = 0;
unsigned char k = 0;
unsigned char c = 0;
for (uint i=0; i<str.size();) {
c = (unsigned char)str[i];
if (c>>7 == one_byte) {
i++;
continue;
} else if (c >= 0X81 && c <= 0XFE) {
k = (unsigned char)str[i+1];
if (k >= 0X40 && k <= 0XFE) {
gbk_yes++;
i += 2;
continue;
}
}
gbk_no++;
i += 2;
}
printf("%d %d\n", gbk_yes, gbk_no);
int ret = (100*gbk_yes)/(gbk_yes+gbk_no);
if (ret > 90) {
return 1;
} else {
return 0;
}
}
实现原理同gb2312和big5
最后关于gb18030:
目前我还没有见到使用gb18030作为网页编码的情况,所以暂时不做gb18030的编码判断;如果以后遇到使用gb18030编码的网页,再做进一步考虑。
首先考虑utf-8编码的判断
utf-8编码的判断格式如下:
1字节 0xxxxxxx
2字节 110xxxxx 10xxxxxx
3字节 1110xxxx 10xxxxxx 10xxxxxx
4字节 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5字节 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6字节 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
这是标准的utf-8编码格式,所以如果网页是utf-8网页,那么必然遵循这个规律
函数实现:
// Tells whether c has the bit pattern 10xxxxxx, i.e. whether its two most
// significant bits are binary 10 (the shape of a UTF-8 continuation byte).
// Returns 1 if so, 0 otherwise.
int Encoder::is_utf8_special_byte(unsigned char c)
{
    // Keep only the top two bits (mask 11000000) and compare with 10000000.
    const unsigned top_bits = c & 0xC0u;
    return (top_bits == 0x80u) ? 1 : 0;
}
int Encoder::is_utf8_code(const string& str)
{
unsigned one_byte = 0X00; //binary 00000000
unsigned two_byte = 0X06; //binary 00000110
unsigned three_byte = 0X0E; //binary 00001110
unsigned four_byte = 0X1E; //binary 00011110
unsigned five_byte = 0X3E; //binary 00111110
unsigned six_byte = 0X7E; //binary 01111110
int utf8_yes = 0;
int utf8_no = 0;
unsigned char k = 0;
unsigned char m = 0;
unsigned char n = 0;
unsigned char p = 0;
unsigned char q = 0;
unsigned char c = 0;
for (uint i=0; i<str.size();) {
c = (unsigned char)str[i];
if (c>>7 == one_byte) {
i++;
continue;
} else if (c>>5 == two_byte) {
k = (unsigned char)str[i+1];
if ( is_utf8_special_byte(k) ) {
utf8_yes++;
i += 2;
continue;
}
} else if (c>>4 == three_byte) {
m = (unsigned char)str[i+1];
n = (unsigned char)str[i+2];
if ( is_utf8_special_byte(m)
&& is_utf8_special_byte(n) ) {
utf8_yes++;
i += 3;
continue;
}
} else if (c>>3 == four_byte) {
k = (unsigned char)str[i+1];
m = (unsigned char)str[i+2];
n = (unsigned char)str[i+3];
if ( is_utf8_special_byte(k)
&& is_utf8_special_byte(m)
&& is_utf8_special_byte(n) ) {
utf8_yes++;
i += 4;
continue;
}
} else if (c>>2 == five_byte) {
unsigned char k = (unsigned char)str[i+1];
unsigned char m = (unsigned char)str[i+2];
unsigned char n = (unsigned char)str[i+3];
unsigned char p = (unsigned char)str[i+4];
if ( is_utf8_special_byte(k)
&& is_utf8_special_byte(m)
&& is_utf8_special_byte(n)
&& is_utf8_special_byte(p) ) {
utf8_yes++;
i += 5;
continue;
}
} else if (c>>1 == six_byte) {
k = (unsigned char)str[i+1];
m = (unsigned char)str[i+2];
n = (unsigned char)str[i+3];
p = (unsigned char)str[i+4];
q = (unsigned char)str[i+5];
if ( is_utf8_special_byte(k)
&& is_utf8_special_byte(m)
&& is_utf8_special_byte(n)
&& is_utf8_special_byte(p)
&& is_utf8_special_byte(q) ) {
utf8_yes++;
i += 6;
continue;
}
}
utf8_no++;
i++;
}
printf("%d %d\n", utf8_yes, utf8_no);
int ret = (100*utf8_yes)/(utf8_yes + utf8_no);
if (ret > 90) {
return 1;
} else {
return 0;
}
}
实现原理:判断网页文本中符合utf-8规则的字数和不符合utf-8规则的字数
如果符合的字数超过90%,则判断为utf-8编码
其次应该是gb2312编码的判断,由于gb2312相对gbk和big5的编码范围要小,所以
在gb2312和gbk和big5之间,应该首先判断该网页文本是否是gb2312
函数实现:
int Encoder::is_gb2312_code(const string& str)
{
unsigned one_byte = 0X00; //binary 00000000
int gb2312_yes = 0;
int gb2312_no = 0;
unsigned char k = 0;
unsigned char c = 0;
for (uint i=0; i<str.size();) {
c = (unsigned char)str[i];
if (c>>7 == one_byte) {
i++;
continue;
} else if (c >= 0XA1 && c <= 0XF7) {
k = (unsigned char)str[i+1];
if (k >= 0XA1 && k <= 0XFE) {
gb2312_yes++;
i += 2;
continue;
}
}
gb2312_no++;
i += 2;
}
printf("%d %d\n", gb2312_yes, gb2312_no);
int ret = (100*gb2312_yes)/(gb2312_yes+gb2312_no);
if (ret > 90) {
return 1;
} else {
return 0;
}
}
实现原理:统计符合gb2312编码特征的字数和不符合gb2312编码特征的字数
如果符合的字数超过90%,则判断该网页文本为gb2312
再者应该判断big5编码,因为gbk的编码范围比big5的编码范围更广
函数实现
int Encoder::is_big5_code(const string& str)
{
unsigned one_byte = 0X00; //binary 00000000
int big5_yes = 0;
int big5_no = 0;
unsigned char k = 0;
unsigned char c = 0;
for (uint i=0; i<str.size();) {
c = (unsigned char)str[i];
if (c>>7 == one_byte) {
i++;
continue;
} else if (c >= 0XA1 && c <= 0XF9) {
k = (unsigned char)str[i+1];
if ( k >= 0X40 && k <= 0X7E
|| k >= 0XA1 && k <= 0XFE) {
big5_yes++;
i += 2;
continue;
}
}
big5_no++;
i += 2;
}
printf("%d %d\n", big5_yes, big5_no);
int ret = (100*big5_yes)/(big5_yes+big5_no);
if (ret > 90) {
return 1;
} else {
return 0;
}
}
实现原理同gb2312
最后是gbk的判断
函数实现
int Encoder::is_gbk_code(const string& str)
{
unsigned one_byte = 0X00; //binary 00000000
int gbk_yes = 0;
int gbk_no = 0;
unsigned char k = 0;
unsigned char c = 0;
for (uint i=0; i<str.size();) {
c = (unsigned char)str[i];
if (c>>7 == one_byte) {
i++;
continue;
} else if (c >= 0X81 && c <= 0XFE) {
k = (unsigned char)str[i+1];
if (k >= 0X40 && k <= 0XFE) {
gbk_yes++;
i += 2;
continue;
}
}
gbk_no++;
i += 2;
}
printf("%d %d\n", gbk_yes, gbk_no);
int ret = (100*gbk_yes)/(gbk_yes+gbk_no);
if (ret > 90) {
return 1;
} else {
return 0;
}
}
实现原理同gb2312和big5
最后关于gb18030:
目前我还没有见到使用gb18030作为网页编码的情况,所以暂时不做gb18030的编码判断;如果以后遇到使用gb18030编码的网页,再做进一步考虑。
相关阅读 更多 +