Html convert to DOM Tree
时间:2010-09-23 来源:_Amo.Jry
添加引用 Microsoft.mshtml
并且在项目属性中勾选"允许不安全代码"(Allow unsafe code).
但是感觉当 html 内容不规范的时候, 程序可能会崩溃.
OMG, 有没有更好的方式来解析 html 内容呢? 望见文者推荐推荐, 谢谢.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using mshtml;
using System.Runtime.InteropServices;
namespace Html2DOMTree
{
/// <summary>
/// Main form of the sample: parses the HTML source typed into <c>richTextBox1</c>
/// with MSHTML and shows the resulting DOM as a tree in <c>treeView1</c>.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>Node name that MSHTML reports for text nodes.</summary>
    private const string TextNodeName = "#text";

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Parses the HTML currently in <c>richTextBox1</c> and appends its DOM tree,
    /// under a new "HTML" root node, to <c>treeView1</c>.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
    {
        // Raw HTML source; an empty text box simply yields an empty string.
        string html = richTextBox1.Text ?? string.Empty;

        IHTMLDocument2 doc2 = Parse(html);

        // Parse returns null on failure, and a document may lack a root element;
        // bail out with a message instead of crashing with a NullReferenceException.
        IHTMLDocument3 document = doc2 as IHTMLDocument3;
        IHTMLDOMNode rootDomNode = document == null
            ? null
            : document.documentElement as IHTMLDOMNode; // document root, i.e. the HTML element
        if (rootDomNode == null)
        {
            MessageBox.Show("Unable to parse the HTML content.");
            return;
        }

        // Suspend repainting while the (possibly large) tree is being filled.
        treeView1.BeginUpdate();
        try
        {
            TreeNode root = treeView1.Nodes.Add("HTML"); // add the root tree node
            InsertDOMNodes(rootDomNode, root);           // mirror the DOM below it
        }
        finally
        {
            treeView1.EndUpdate();
        }
    }

    /// <summary>
    /// Parses an HTML string into an MSHTML document using <c>IMarkupServices.ParseString</c>.
    /// </summary>
    /// <param name="s">HTML source text; <c>null</c> is treated as an empty string.</param>
    /// <returns>
    /// The parsed document, or <c>null</c> when the MSHTML document does not expose the
    /// required interfaces or no markup container was produced.
    /// </returns>
    /// <remarks>Requires an unsafe context because the string is passed as a raw pointer.</remarks>
    unsafe IHTMLDocument2 Parse(string s)
    {
        if (s == null)
        {
            // StringToHGlobalUni(null) returns IntPtr.Zero, which must not be dereferenced below.
            s = string.Empty;
        }

        IHTMLDocument2 pDocument = new HTMLDocumentClass();

        // "as" yields null instead of throwing when the interface is not supported.
        IPersistStreamInit pPersist = pDocument as IPersistStreamInit;
        if (pPersist == null)
        {
            return null;
        }
        pPersist.InitNew();

        IMarkupServices ms = pDocument as IMarkupServices;
        if (ms == null)
        {
            return null;
        }

        IMarkupContainer pMC = null;
        IMarkupPointer pStart, pEnd;
        ms.CreateMarkupPointer(out pStart);
        ms.CreateMarkupPointer(out pEnd);

        // Copy the string into unmanaged memory so it can be handed over as a UTF-16 pointer.
        IntPtr pSource = Marshal.StringToHGlobalUni(s);
        try
        {
            ms.ParseString(ref *(ushort*)pSource.ToPointer(), 0, out pMC, pStart, pEnd);
        }
        finally
        {
            // Memory from StringToHGlobalUni must be freed with FreeHGlobal;
            // Marshal.Release is only valid for COM interface pointers.
            Marshal.FreeHGlobal(pSource);
        }

        return pMC as IHTMLDocument2;
    }

    /// <summary>
    /// Recursively mirrors the children of a DOM node into a tree node.
    /// Text nodes are added by their trimmed value (blank ones are skipped);
    /// every other node is added by its node name and then descended into.
    /// </summary>
    /// <param name="parentnode">DOM node whose children are inserted.</param>
    /// <param name="tree_node">Tree node that receives the new child nodes.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="parentnode"/> or <paramref name="tree_node"/> is <c>null</c>.
    /// </exception>
    public void InsertDOMNodes(IHTMLDOMNode parentnode, TreeNode tree_node)
    {
        if (parentnode == null)
        {
            throw new ArgumentNullException("parentnode");
        }
        if (tree_node == null)
        {
            throw new ArgumentNullException("tree_node");
        }
        if (!parentnode.hasChildNodes())
        {
            return;
        }

        IHTMLDOMChildrenCollection children = (IHTMLDOMChildrenCollection)parentnode.childNodes;
        int length = children.length;
        for (int i = 0; i < length; i++)
        {
            IHTMLDOMNode child = (IHTMLDOMNode)children.item(i);

            // Read the name once; each property access is a COM call.
            string nodeName = child.nodeName;

            if (nodeName == TextNodeName)
            {
                object rawValue = child.nodeValue;
                string text = rawValue == null ? "" : rawValue.ToString().Trim();
                if (text.Length > 0)
                {
                    tree_node.Nodes.Add(text);
                }
            }
            else
            {
                TreeNode childTreeNode = tree_node.Nodes.Add(nodeName);
                InsertDOMNodes(child, childTreeNode);
            }
        }
    }
}
/// <summary>
/// Managed COM-import declaration of the <c>IPersistStreamInit</c> interface
/// (IID 7FD52380-4E07-101B-AE2D-08002B2EC713), declared as an IUnknown-based interface.
/// Form1.Parse casts the MSHTML document to this interface in order to call <see cref="InitNew"/>.
/// </summary>
/// <remarks>
/// Do not reorder the members: for an IUnknown-based ComImport interface the declaration
/// order determines which native vtable slot each method maps to.
/// </remarks>
[ComVisible(true), ComImport(), Guid("7FD52380-4E07-101B-AE2D-08002B2EC713"), InterfaceTypeAttribute(ComInterfaceType.InterfaceIsIUnknown)]
public interface IPersistStreamInit
{
/// <summary>Passes a <see cref="Guid"/> by reference for the object to fill in (presumably its CLSID, per the method name).</summary>
void GetClassID([In, Out] ref Guid pClassID);
/// <summary>
/// Returns the raw 32-bit result of the native call; <c>PreserveSig</c> keeps the HRESULT
/// as the return value instead of converting failures into exceptions.
/// </summary>
[return: MarshalAs(UnmanagedType.I4)]
[PreserveSig]
int IsDirty();
/// <summary>Takes a COM stream interface pointer as input (presumably the stream to load state from).</summary>
// NOTE(review): UCOMIStream is the obsolete stream interop type; newer code uses
// System.Runtime.InteropServices.ComTypes.IStream - confirm before changing.
void Load([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm);
/// <summary>Takes a COM stream interface pointer and a 32-bit "clear dirty" flag as input (presumably the stream to save state to).</summary>
void Save([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm,
[In, MarshalAs(UnmanagedType.I4)] int fClearDirty);
// NOTE(review): LPArray marshaling is applied to a scalar long passed by value, which looks
// wrong for an output size; the native method presumably writes through a pointer
// (i.e. "out long") - confirm against the native declaration. Not called in this file.
void GetSizeMax([Out, MarshalAs(UnmanagedType.LPArray)] long pcbSize);
/// <summary>Takes no arguments; called by Form1.Parse on a freshly created document before parsing.</summary>
void InitNew();
}
}