脏字过滤算法
时间:2010-11-25 来源:_Sin
程序包下载Word.rar
修改后
   public class DirtyWordOper
      {
          private static Dictionary<string, object> hash = new Dictionary<string, object>();
          private static BitArray firstCharCheck = new BitArray(char.MaxValue);//把脏词的第一个字符记录下来
          private static BitArray allCharCheck = new BitArray(char.MaxValue);//把每一个个脏词的所有字符都记录下来
          private static int maxLength = 0;//
          private static bool onlyOne = true;
          #region
          /// <summary>
          /// 返回替换后的字符串 字符串的长度不变
          /// </summary>
          /// <param name="text"></param>
          /// <returns></returns>
          public string Replace(string text)
          {
              if (onlyOne)
              {
                  Init();//初始化数据 执行一次就不会执行了
                  onlyOne = false;
              }
              if (!isDirtyword(text))
              {
                  return text;
              }
              //获取替换操作表
              List<DetailRepModel> drlist = GetList(text);
              //执行替换操作
              return Replace2(text, drlist);
          }
          /// <summary>
          /// 初始化用  只执行一次
          /// </summary>
          /// <param name="text"></param>
          private static void Init()
          {
              string[] badwords = DirtyWordData.DirtyKeyword.Split('|');
              foreach (string bw in badwords)
              {
                  string[] strarrtemp = bw.Split('&');
                  string word = strarrtemp[0];
                  word = word.Trim();//去掉数据中的空格及格式 符号
                  word = word.Replace("/r", "");
                  word = word.Replace("/n", "");
                  if (word == "")
                  {
                      break;
                  }
                  if (!hash.ContainsKey(word))
                  {
                      hash.Add(word, null);
                      maxLength = Math.Max(maxLength, word.Length);
                      firstCharCheck[word[0]] = true;
                      foreach (char c in word)
                      {
                          allCharCheck[c] = true;
                      }
                  }
              }
          }
          /// <summary>
          /// 是否包含 了 脏 词
          /// </summary>
          /// <param name="text"></param>
          /// <returns></returns>
          private static bool isDirtyword(string text)
          {
              int index = 0;
              //int offset = 0;
              while (index < text.Length)
              {
                  //如果第一个字符都不符合
                  if (!firstCharCheck[text[index]])
                  {// 直接找到与脏词第一字符相同为止
                      while (index < text.Length - 1 && !firstCharCheck[text[++index]]) ;
                  }
                  for (int j = 1; j <= Math.Min(maxLength, text.Length - index); j++)
                  {
                      if (!allCharCheck[text[index + j - 1]])
                      {
                          break;
                      }
                      string sub = text.Substring(index, j);
                      //判定脏字字典中是否包括了脏词
                      if (hash.ContainsKey(sub))
                      {
                          return true;//是
                      }
                  }
                  index++;
              }
              return false;//否
          }
          /// <summary>
          /// 返回操作列表
          /// </summary>
          /// <param name="text"></param>
          /// <returns></returns>
          private static List<DetailRepModel> GetList(string text)
          {
              List<DetailRepModel> DetailList = new List<DetailRepModel>();
              int index = 0;
              while (index < text.Length)
              {
                  if (!firstCharCheck[text[index]])
                  {
                      while (index < text.Length - 1 && !firstCharCheck[text[++index]]) ;
                  }
                  DetailRepModel tempDetail = null;
                  for (int j = 1; j <= Math.Min(maxLength, text.Length - index); j++)
                  {
                      if (!allCharCheck[text[index + j - 1]])
                      {
                          if (tempDetail != null)
                          {//优先先字符串替换
                              index = index + tempDetail.number - 1;//索引要返回上一位,所以要减1
                              DetailList.Add(tempDetail);
                          }
                          break;
                      }
                      string sub = text.Substring(index, j);
                      if (hash.ContainsKey(sub))
                      {
                          tempDetail = new DetailRepModel();
                          tempDetail.index = index;
                          tempDetail.number = sub.Length;
                          tempDetail.content = sub;
                          //break;//进行下一次 不然要出现, abc 其中ab 与a都关键字要生成两个操作                      
                      }
                      if (tempDetail != null)
                      {
                          if (j + 1 > Math.Min(maxLength, text.Length - index))
                          {//优先先字符串替换
                              DetailList.Add(tempDetail);
                              index = index + tempDetail.number - 1;//索引要返回上一位,所以要减1
                          }
                      }
                  }
                  index++;
              }
              return DetailList;
          }
          /// <summary>
          /// 传入 字串和 脏字替换操作表,
          /// </summary>
          /// <param name="text"></param>
          /// <param name="drlist"></param>
          /// <returns> 输出替换后的字串</returns>
          private static string Replace2(string text, List<DetailRepModel> drlist)
          {
  
              if (drlist == null || drlist.Count == 0 || text == "")
              {
                  return text;
              }
              foreach (DetailRepModel dr in drlist)
              {
                  if (dr != null)
                  {
                      string strtemp = text.Substring(dr.index, dr.number);
                      object ob = DirtyWordData.DirtyHT[(object)strtemp];
                      if (ob == null)
                      {
                          //记录错误
                          break;
                      }
                      // 这样替换 有错误 ,
                      text = text.Substring(0, dr.index) + ob.ToString() + text.Substring(dr.index + dr.number);
                      //text = text.Replace(strtemp, ob.ToString());
                  }
              }
              return text;
          }
          #endregion
      }
效果还行。不过我们老大给我讲了一个更厉害的方法,据说比上面这种要快 50 倍,只是写起来有点麻烦:
   /// <summary>
   /// String-to-string replacement contract. The class that ReplaceDW generates
   /// and compiles at run time (KeyWord.ReplaceDW_) implements this interface.
   /// </summary>
   public interface IReplaceDW
      {
          /// <summary>
          /// Returns the processed form of <paramref name="s"/>.
          /// </summary>
          /// <param name="s">Input text.</param>
          /// <returns>The resulting text.</returns>
          string Replace(string s);
      }
      public class ReplaceDW
      {
          public static void AddToWords(DirtyChar parent, string s, string t)
          {
              DirtyChar dc = parent.Children.Find(o => o.Orienginal == s[0]);
              if (dc == null)
              {
                  dc = new DirtyChar() { Orienginal = s[0], Children = new List<DirtyChar>(), Target = "" };
                  parent.Children.Add(dc);
              }
              if (s.Length > 1)
              {//
                  AddToWords(dc, s.Substring(1), t);
              }
              else
              {
                  dc.Target = t;
              }
          }
          public static string BuildChildren(DirtyChar dc, int deepLevel)
          {
              StringBuilder sb = new StringBuilder();
              string spaces = new string(' ', deepLevel + 4);
              if (dc.Children.Count > 0)
              {
                  sb.Append(@"
  " + spaces + @"if (i + 1 == len){");
                  sb.Append(@"
  " + spaces + @"    sb.Append(""" + dc.Target + @""");
                  ");
                  sb.Append(@"
  " + spaces + @"    i++;
  " + spaces + @"    break;}");
                  sb.Append(@"
  " + spaces + @" switch (s[i + " + deepLevel.ToString() + @"])
  " + spaces + @" {
  ");
                  foreach (DirtyChar c in dc.Children)
                  {
                      sb.Append(@"
  " + spaces + @"  case '" + c.Orienginal + @"':
  ");
                      sb.Append(BuildChildren(c, deepLevel + 1));
                      sb.Append(@"
  " + spaces + @"   break;");
                  }
                
                
                  sb.Append(@"
  " + spaces + @" default:
  " + spaces + @"    sb.Append(""" + dc.Target + @""");
  " + spaces + @"    i++;
  " + spaces + @"    break;
  " + spaces + @" }
  ");
              }
              else
              {
                  sb.Append(@"
  " + spaces + @"  sb.Append(""" + dc.Target + @""");
  ");
                  if (deepLevel == 1)
                  {
                      sb.Append(@"
  " + spaces + @"  i++;
  ");
                  }
                  else
                  {
                      sb.Append(@"
  " + spaces + @"  i += " + (deepLevel).ToString() + @";
  ");
                  }
              }
              return sb.ToString();
          }
  
          private IReplaceDW _r = null;
          private static bool isfirst = true;
          public string Replace(string s)
          {
              return _r.Replace(s);
          }
          private static List<KeyValuePair<string, string>> tmp = new List<KeyValuePair<string, string>>();
          public ReplaceDW()
          {
              if (isfirst)
              {              
                  List<KeyValuePair<string, string>> dict = new List<KeyValuePair<string, string>>();
                  foreach (DictionaryEntry d in KeyWord.DirtyWordData.DirtyHT)
                  {
                      dict.Add(new KeyValuePair<string, string>(d.Key.ToString(), d.Value.ToString()));
                  }
                  // 整理进 list
                  //List<KeyValuePair<string, string>> tmp = new List<KeyValuePair<string, string>>();
                  foreach (KeyValuePair<string, string> kv in dict)
                  {
                      tmp.Add(kv);
                  }
                  // 倒排
                  tmp.Sort((a, b) => { return b.Key.CompareTo(a.Key); });
                  isfirst = false;
              }
              var compiler = new CSharpCodeProvider();
              var options = new CompilerParameters();
              // set compile options  
              options.CompilerOptions = "/o";
              options.GenerateExecutable = false;
              options.GenerateInMemory = true;
              options.ReferencedAssemblies.Add("System.dll");
              options.ReferencedAssemblies.Add(this.GetType().Assembly.Location);
              // set the source code to compile  
              DirtyChar words = new DirtyChar() { Children = new List<DirtyChar>() };
              //DirtyChar words2 = new DirtyChar();
              //words2.Children = new List<DirtyChar>();
              foreach (KeyValuePair<string, string> kv in tmp)
              {//构建字典表
                  AddToWords(words, kv.Key, kv.Value);
              }
  
              StringBuilder sb = new StringBuilder();
              sb.Append(@"
  using System;  
  namespace KeyWord
  {
  public class ReplaceDW_ : IReplaceDW
  {  
      public string Replace( string s )
   {  
    int len = s.Length, i = 0;
          System.Text.StringBuilder sb = new System.Text.StringBuilder(len);
  ");
              sb.Append(@"
    while (i < len)
    {
     switch (s[i])
     {
  ");
              foreach (DirtyChar c in words.Children)
              {
                  sb.Append(@"
      case '" + c.Orienginal + @"':
  ");
                  sb.Append(BuildChildren(c, 1));
                  sb.Append(@"
       break;");
              }
              sb.Append(@"
      default:
       sb.Append(s[i++]);
       break;
     }
    }
  ");
              sb.Append(@"
    return sb.ToString();
      }  
  }
  }");
              // compile the code, on-the-fly  
              var result = compiler.CompileAssemblyFromSource(options, sb.ToString());
             
              foreach (var error in result.Errors)
              {
                  // print errors  
                  ;
              }
              // if compilation sucessed  
              if ((!result.Errors.HasErrors) && (result.CompiledAssembly != null))
              {
                  var type = result.CompiledAssembly.GetType("KeyWord.ReplaceDW_");
                  try
                  {
                      if (type != null)
                      {
                          this._r = Activator.CreateInstance(type) as IReplaceDW;
                      }
                      this.Replace("x"); //预热
                      this.Replace("x"); //预热
                  }
                  catch (Exception ex)
                  {
                      Console.WriteLine(ex);
                  }
              }
          }
      }










