boost::regex处理中文方法

时间：2010-11-18 来源：zhdrfirst

正则表达式库boost::regex在处理中文时需要使用宽字符，会用到以下和宽字符有关的类：
1、wstring：
作为STL中和string相对应的类，专门用于处理宽字符串。方法和string都一样，区别是value_type是wchar_t。wstring类的对象要赋值或连接的常量字符串必须以L开头标示为宽字符。
2、wregex：
和regex相对应，专门处理宽字符的正则表达式类。同样可以使用regex_match()和regex_replace()等函数。regex_match()的结果需要放在wsmatch类的对象中。代码示例：

// Separator pattern: one or more Chinese sentence-ending marks. The L prefix makes the literal a wide-character string, as wregex requires.
boost::wregex word_sep(L"[。！？]+");

// Submatch index -1 asks the token iterator for the text between separator matches (i.e. the sentences), not the separators themselves.
// The default-constructed iterator serves as the end-of-sequence marker.
boost::wsregex_token_iterator sentences(text.begin(),text.end(),word_sep,-1); boost::wsregex_token_iterator end;

注意，对于regex处理得到的结果，还必须转变为utf-8格式，才能在linux下面显示（如果在windows下，用MS VC++编译器，不需要转换）。我使用的转换方法：

/**
 * Convert a wide string to a UTF-8 encoded byte string.
 *
 * The raw bytes of strraw are passed to code_convert() with the source
 * charset "WCHAR_T" and the target charset "utf-8".
 *
 * @param strraw  wide string to convert; it is only read here.
 * @return the bytes written to the output buffer up to the first NUL byte;
 *         an empty string when strraw is empty.
 */
const string CCodeTransformer::wstr2utf(wstring & strraw)
{
    string result = "";

    // Size the input from the real width of wchar_t rather than a hard-coded 4,
    // so the copy never reads past the source on platforms with a 2-byte wchar_t.
    const int length1 = static_cast<int>(sizeof(wchar_t) * strraw.length());
    if (length1 == 0)
        return result;

    // Both buffers are owned by std::string objects: allocation failure throws
    // instead of yielding a NULL pointer, and the memory is released on every path.
    string inbuf(reinterpret_cast<const char *>(strraw.c_str()), length1);

    // The output buffer is twice the input size in bytes. One extra zeroed byte
    // is kept beyond the length2 bytes offered to code_convert(), so the buffer
    // is always NUL-terminated.
    const int length2 = 2 * length1;
    string outbuf(length2 + 1, '\0');

    // NOTE(review): code_convert() is declared elsewhere; its return value was
    // never inspected by the original code and its meaning is not visible here,
    // so it is still ignored -- confirm whether failures should be reported.
    code_convert("WCHAR_T","utf-8",&inbuf[0], length1, &outbuf[0], length2);

    // Take the converted text up to the first NUL byte.
    result = outbuf.c_str();
    return result;
}
// Overload accepting a const wide string: strips the const qualifier and
// forwards to the overload that takes a non-const reference.
const string CCodeTransformer::wstr2utf(const wstring & strraw)
{
    wstring & writable = const_cast<wstring &>(strraw);
    return wstr2utf(writable);
}

下面的方法来自《解决Boost.Regex对中文支持不好的问题》，我没有实验！
字符和宽字符的相互转换：
1、RTL的方法
//把字符串转换成宽字符串
    setlocale( LC_CTYPE, "" ); // 很重要，没有这一句，转换会失败。
    int iWLen= mbstowcs( NULL, sToMatch.c_str(), sToMatch.length() ); // 计算转换后宽字符串的长度。（不包含字符串结束符）
    wchar_t *lpwsz= new wchar_t[iWLen+1];
    int i= mbstowcs( lpwsz, sToMatch.c_str(), sToMatch.length() ); // 转换。（转换后的字符串有结束符）
    wstring wsToMatch(lpwsz);
    delete []lpwsz;
//把宽字符串转换成字符串，输出使用
    int iLen= wcstombs( NULL, wsm[1].str().c_str(), 0 ); // 计算转换后字符串的长度。（不包含字符串结束符）
    char *lpsz= new char[iLen+1];
    int i= wcstombs( lpsz, wsm[1].str().c_str(), iLen ); // 转换。（没有结束符）
    lpsz[iLen] = '\0';
    string sToMatch(lpsz);
    delete []lpsz;
2、Win32 SDK的方法
//把字符串转换成宽字符串
    int iWLen= MultiByteToWideChar( CP_ACP, 0, sToMatch.c_str(), sToMatch.size(), 0, 0 ); // 计算转换后宽字符串的长度。（不包含字符串结束符）
    wchar_t *lpwsz= new wchar_t [iWLen+1];
    MultiByteToWideChar( CP_ACP, 0, sToMatch.c_str(), sToMatch.size(), lpwsz, iWLen ); // 正式转换。
    wsz[iWLen] = L'\0';
//把宽字符串转换成字符串，输出使用
    int iLen= WideCharToMultiByte( CP_ACP, NULL, wsResult.c_str(), -1, NULL, 0, NULL, FALSE ); // 计算转换后字符串的长度。（包含字符串结束符）
    char *lpsz= new char[iLen];
    WideCharToMultiByte( CP_OEMCP, NULL, wsResult.c_str(), -1, lpsz, iLen, NULL, FALSE); // 正式转换。
    sResult.assign( lpsz, iLen-1 ); // 对string对象进行赋值
对于wstring字符串，在进行输入输出时，需要分别使用wifstream、wofstream、wcout等宽字符流。代码实例：

19 #include <iostream>
20 #include <string>
21 #include <fstream>
22 #include <boost/regex.hpp>
23 #include <locale>
24 #include "codetransformer.h"
25
26 using namespace std;
27
28 int main() {
29
30 CCodeTransformer codeTrans;
31 wstring text;
32 wstring sen;
33 locale lang("zh_CN.UTF-8");
34 //wcout.imbue(lang);

35 wifstream input;
36 input.open("test");
37 input.imbue(lang);
38 getline(input,text);
39 wofstream output;
40 output.open("result");
41 output.imbue(lang);
42 //ifstream input("test");

43 //istreambuf_iterator<char> beg(input), end1;

44 //string text(beg, end1);

45 //wcout<<text<<endl;

46 output<<text<<endl;
47 boost::wregex word_sep(L"[。！？]+");
48 boost::wsregex_token_iterator sentences(text.begin(),text.end(),word_sep,-1);
49 boost::wsregex_token_iterator end;
50 for(boost::wsregex_token_iterator begin=sentences;begin!=end;begin++){
51 output<<*begin<<endl;
52 sen=*begin;
53 string result= codeTrans.wstr2utf(sen);
54 cout<<result<<endl;
55 }
56
57 return 1;
58 }

有价值的链接：宽字符的介绍；MultiByteToWideChar；终于找到一个写Unicode 文本文件的方法了；C++如何使用wcout输出Unicode字符串；关于wcout,wofstream的一些东西；wprintf 和 wcout