文章详情

  • 游戏榜单
  • 软件榜单
关闭导航
热搜榜
热门下载
热门标签
php爱好者> php文档>TF-IDF(C#)

TF-IDF(C#)

时间:2010-12-29  来源:live41

 

PS:codeproject.com 上有一个泰国作者写的版本,但代码写得非常乱。

 

代码:

using System;
using System.Collections.Generic;
using System.Text;

namespace Cluster
{
    /// <summary>
    /// term frequency–inverse document frequency
    /// </summary>
    static class TFIDF
    {
        /// <summary>
        /// 计算tf-idf
        /// </summary>
        /// <param name="docs">待处理文档(已分词)</param>
        /// <returns></returns>
        public static List<Dictionary<int, double>> Calculate(string[][] docs)
        {
            List<Dictionary<int, double>> tfidfs = new List<Dictionary<int, double>>();

            Dictionary<string, Term> terms = new Dictionary<string, Term>(); //词表
            List<Dictionary<int, double>> tfs = new List<Dictionary<int, double>>(); //词频
            Dictionary<int, double> idfs = new Dictionary<int, double>(); //逆文档频率

            CalcTF(docs, terms, tfs);
            CalcIDF(docs, terms, idfs);
            CalcTFIDF(tfs, idfs, tfidfs);

            return tfidfs;
        }

        #region TF
        /// <summary>
        /// 计算词频(term frequency)
        /// </summary>
        /// <param name="docs">文档</param>
        /// <param name="terms">词表</param>
        /// <param name="tfs">词数</param>
        private static void CalcTF(string[][] docs, Dictionary<string, Term> terms, List<Dictionary<int, double>> tfs)
        {
            foreach (string[] doc in docs)
            {
                Dictionary<int, int> termNums = new Dictionary<int, int>();
                foreach (string term in doc)
                {
                    int index = -1; //词表索引
                    if (!terms.ContainsKey(term))
                    {
                        index = terms.Count;
                        terms.Add(term, new Term(index));
                    }
                    else
                    {
                        index = terms[term].index;
                    }
                    if (!termNums.ContainsKey(index))
                    {
                        termNums.Add(index, 1);
                        terms[term].docNum++; //词的文档数
                    }
                    else
                    {
                        termNums[index]++;
                    }
                }
                double len = (double)doc.Length;
                Dictionary<int, double> tf = new Dictionary<int, double>(); //词频
                foreach (KeyValuePair<int, int> kvp in termNums)
                {
                    tf.Add(kvp.Key, (double)kvp.Value / len); //当前词的词数/总词数
                }
                tfs.Add(tf);
            }
        }
        #endregion

        #region IDF
        /// <summary>
        /// 计算逆文档频率(inverse document frequency)
        /// </summary>
        /// <param name="docs"></param>
        /// <param name="terms"></param>
        /// <param name="idfs"></param>
        private static void CalcIDF(string[][] docs, Dictionary<string, Term> terms, Dictionary<int, double> idfs)
        {
            double len = (double)docs.Length;
            foreach (KeyValuePair<string, Term> kvp in terms)
            {
                double idf = Math.Log(len / (double)kvp.Value.docNum, Math.E); //ln(总文档数/当前词出现过的文档数)
                idfs.Add(kvp.Value.index, idf);
            }
        }
        #endregion

        #region TF-IDF
        /// <summary>
        /// 
        /// </summary>
        /// <param name="tfs"></param>
        /// <param name="idfs"></param>
        /// <param name="tfidfs"></param>
        private static void CalcTFIDF(List<Dictionary<int, double>> tfs, Dictionary<int, double> idfs, List<Dictionary<int, double>> tfidfs)
        {
            foreach (Dictionary<int, double> tf in tfs)
            {
                Dictionary<int, double> tfidf = new Dictionary<int, double>();
                foreach (KeyValuePair<int, double> kvp in tf)
                {
                    tfidf.Add(kvp.Key, kvp.Value * idfs[kvp.Key]);
                }
                tfidfs.Add(tfidf);
            }
        }
        #endregion

    }
}

 

 

相关阅读 更多 +
排行榜 更多 +
白银之城手游官服下载

白银之城手游官服下载

角色扮演 下载
像素赛车手魔改版下载

像素赛车手魔改版下载

赛车竞速 下载
自由城计划翼豪陆神模组手机版下载

自由城计划翼豪陆神模组手机版下载

角色扮演 下载