// TF-IDF (C#)
// Date: 2010-12-29   Source: live41
// P.S.: codeproject.com hosts another version of this (by a Thai author), but its code is very messy.

using System;
using System.Collections.Generic;
using System.Text;
namespace Cluster
{
/// <summary>
/// Term frequency–inverse document frequency (TF-IDF) weighting for documents
/// that have already been split into terms.
/// </summary>
/// <remarks>
/// Relies on the <c>Term</c> type declared outside this class: its constructor taking
/// the vocabulary index, its <c>index</c> member and its <c>docNum</c> counter.
/// NOTE(review): the code assumes a freshly constructed <c>Term</c> starts with
/// <c>docNum == 0</c> — confirm against the <c>Term</c> declaration.
/// </remarks>
static class TFIDF
{
    /// <summary>
    /// Calculates the TF-IDF weight of every distinct term in every document.
    /// </summary>
    /// <param name="docs">Documents to process; each document is an array of already-segmented terms.</param>
    /// <returns>
    /// One dictionary per document, in input order. Each maps a term's vocabulary index
    /// (assigned in order of first appearance across all documents) to
    /// tf * idf, where tf = occurrences / document length and
    /// idf = ln(total documents / documents containing the term).
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="docs"/> is null.</exception>
    /// <exception cref="ArgumentException">A document, or a term inside a document, is null.</exception>
    public static List<Dictionary<int, double>> Calculate(string[][] docs)
    {
        if (docs == null)
        {
            throw new ArgumentNullException("docs");
        }

        List<Dictionary<int, double>> tfidfs = new List<Dictionary<int, double>>(docs.Length);
        Dictionary<string, Term> terms = new Dictionary<string, Term>();                    // vocabulary
        List<Dictionary<int, double>> tfs = new List<Dictionary<int, double>>(docs.Length); // term frequencies
        Dictionary<int, double> idfs = new Dictionary<int, double>();                       // inverse document frequencies

        // Order matters: CalcTF builds the vocabulary and the per-term document
        // counts that CalcIDF reads; CalcTFIDF combines both results.
        CalcTF(docs, terms, tfs);
        CalcIDF(docs, terms, idfs);
        CalcTFIDF(tfs, idfs, tfidfs);
        return tfidfs;
    }

    #region TF
    /// <summary>
    /// Calculates the term frequency of every document and, as a side effect,
    /// builds the vocabulary together with each term's document count.
    /// </summary>
    /// <param name="docs">Documents (arrays of terms).</param>
    /// <param name="terms">Vocabulary to fill: term text to its <c>Term</c> entry.</param>
    /// <param name="tfs">Receives one dictionary per document: vocabulary index to term frequency.</param>
    /// <exception cref="ArgumentException">A document, or a term inside a document, is null.</exception>
    private static void CalcTF(string[][] docs, Dictionary<string, Term> terms, List<Dictionary<int, double>> tfs)
    {
        foreach (string[] doc in docs)
        {
            if (doc == null)
            {
                throw new ArgumentException("Documents must not be null.", "docs");
            }

            // Number of occurrences of each term (by vocabulary index) in this document.
            Dictionary<int, int> termNums = new Dictionary<int, int>();
            foreach (string term in doc)
            {
                if (term == null)
                {
                    throw new ArgumentException("Terms must not be null.", "docs");
                }

                int index; // vocabulary index of the current term
                Term entry;
                if (terms.TryGetValue(term, out entry))
                {
                    index = entry.index;
                }
                else
                {
                    // First time this term is seen anywhere: give it the next free index.
                    index = terms.Count;
                    entry = new Term(index);
                    terms.Add(term, entry);
                }

                int count;
                if (termNums.TryGetValue(index, out count))
                {
                    termNums[index] = count + 1;
                }
                else
                {
                    // First occurrence in this document: it counts once toward the
                    // number of documents containing the term.
                    termNums.Add(index, 1);
                    entry.docNum++;
                }
            }

            // An empty document yields an empty dictionary; the division below is
            // never reached with a zero length because termNums is then empty.
            double len = doc.Length;
            Dictionary<int, double> tf = new Dictionary<int, double>(termNums.Count);
            foreach (KeyValuePair<int, int> kvp in termNums)
            {
                tf.Add(kvp.Key, kvp.Value / len); // occurrences of the term / total terms in the document
            }
            tfs.Add(tf);
        }
    }
    #endregion

    #region IDF
    /// <summary>
    /// Calculates the inverse document frequency of every term in the vocabulary.
    /// </summary>
    /// <param name="docs">Documents; only their count is used.</param>
    /// <param name="terms">Vocabulary with per-term document counts, as filled by <c>CalcTF</c>.</param>
    /// <param name="idfs">Receives vocabulary index to ln(total documents / documents containing the term).</param>
    private static void CalcIDF(string[][] docs, Dictionary<string, Term> terms, Dictionary<int, double> idfs)
    {
        double len = docs.Length;
        foreach (KeyValuePair<string, Term> kvp in terms)
        {
            Term entry = kvp.Value;
            // Single-argument Math.Log is the natural logarithm.
            double idf = Math.Log(len / entry.docNum);
            idfs.Add(entry.index, idf);
        }
    }
    #endregion

    #region TF-IDF
    /// <summary>
    /// Multiplies each term frequency by the matching inverse document frequency.
    /// </summary>
    /// <param name="tfs">Per-document term frequencies keyed by vocabulary index.</param>
    /// <param name="idfs">Inverse document frequencies keyed by vocabulary index.</param>
    /// <param name="tfidfs">Receives one dictionary per document: vocabulary index to tf * idf.</param>
    private static void CalcTFIDF(List<Dictionary<int, double>> tfs, Dictionary<int, double> idfs, List<Dictionary<int, double>> tfidfs)
    {
        foreach (Dictionary<int, double> tf in tfs)
        {
            Dictionary<int, double> tfidf = new Dictionary<int, double>(tf.Count);
            foreach (KeyValuePair<int, double> kvp in tf)
            {
                tfidf.Add(kvp.Key, kvp.Value * idfs[kvp.Key]);
            }
            tfidfs.Add(tfidf);
        }
    }
    #endregion
}
}
// Related reading: more +
// Rankings: more +