NET爬虫技术之.NET抓取Web网页数据分析实例
时间:2010-08-27 来源:会编程的星矢
以下是程序具体实现过程:
private void button1_Click(object sender, EventArgs e)
{
//要抓取的URL地址
string Url = "http://list.mp3.baidu.com/topso/mp3topsong.html?id=1#top2";
//得到指定Url的源码
string strWebContent = GetWebContent(Url);
richTextBox1.Text = strWebContent;
//取出和数据有关的那段源码
int iBodyStart = strWebContent.IndexOf("<body", 0);
int iStart = strWebContent.IndexOf("歌曲TOP500", iBodyStart);
int iTableStart = strWebContent.IndexOf("<table", iStart);
int iTableEnd = strWebContent.IndexOf("</table>", iTableStart);
string strWeb = strWebContent.Substring(iTableStart, iTableEnd - iTableStart + 8);
//生成HtmlDocument
WebBrowser webb = new WebBrowser();
webb.Navigate("about:blank");
HtmlDocument htmldoc = webb.Document.OpenNew(true);
htmldoc.Write(strWeb);
HtmlElementCollection htmlTR = htmldoc.GetElementsByTagName("TR");
foreach (HtmlElement tr in htmlTR)
{
string strID = tr.GetElementsByTagName("TD")[0].InnerText;
string strName = SplitName(tr.GetElementsByTagName("TD")[1].InnerText, "MusicName");
string strSinger = SplitName(tr.GetElementsByTagName("TD")[1].InnerText, "Singer");
strID = strID.Replace(".", "");
//插入DataTable
AddLine(strID, strName, strSinger,"0");
string strID1 = tr.GetElementsByTagName("TD")[2].InnerText;
string strName1 = SplitName(tr.GetElementsByTagName("TD")[3].InnerText, "MusicName");
string strSinger1 = SplitName(tr.GetElementsByTagName("TD")[3].InnerText, "Singer");
//插入DataTable
strID1 = strID1.Replace(".", "");
AddLine(strID1, strName1, strSinger1,"0");
string strID2 = tr.GetElementsByTagName("TD")[4].InnerText;
string strName2 = SplitName(tr.GetElementsByTagName("TD")[5].InnerText, "MusicName");
string strSinger2 = SplitName(tr.GetElementsByTagName("TD")[5].InnerText, "Singer");
//插入DataTable
strID2 = strID2.Replace(".", "");
AddLine(strID2, strName2, strSinger2,"0");
}
//插入数据库
InsertData(dt);
dataGridView1.DataSource = dt.DefaultView;
}
程序运行结果界面图:
文章来自学IT网:http://www.xueit.com/asp.net/show-4681-2.aspx