标记语言处理模型（演示版）

时间：2007-02-17 来源：PHP爱好者

这是一个模仿 HTML、XML 语言从文本转化为对象这一过程的模型，并放宽了一些限制（例如 tagName 不能修改、包围标记只能是“<”和“>”等），尽可能地扩大对文本进行自由处理的空间。

通过这个模型，也就可以制作出一系列关于标记语言的实例程序，例如 HTML 和 UBB 的双向转换程序，或者像 Internet Explorer 显示 XML 文档那样带有语法高亮（highlight）和折叠功能的视图等。

标记语言处理模型
<!--
  Markup-language processing model (demo).

  The textarea holds UBB-style sample markup. On load it is parsed with "[" and
  "]" as the tag delimiters; the buttons then inspect the first parsed element
  and the "ubb2html" example rewrites every [b] element to use "<" and ">".

  NOTE(review): this block was reconstructed from a copy in which line breaks,
  backslashes and "[i]" index expressions had been stripped and the UBB sample
  had been converted to <B> tags. The sample input below is the most plausible
  original; confirm against the source article if exact fidelity matters.
-->
<textarea id="code" rows="15" cols="100">[b color=#FF0000]aaa[b]bbb[/b][b]ccc[/b]eeee[b]ddd[/b]</textarea><br />
<h3>属性</h3>
<button onclick="alert(firstchildren.tagName)">tagName</button>
<button onclick="alert(firstchildren.childNodes)">childNodes</button>
<button onclick="alert(firstchildren.attributes)">attributes</button>
<hr />
<h3>方法</h3>
<button onclick="alert(firstchildren.getOuterHTML())">getOuterHTML</button>
<button onclick="alert(firstchildren.getInnerHTML())">getInnerHTML</button>
<button onclick="alert(firstchildren.getAttributeNode('color').value)">getAttributeNode</button>
<button onclick="alert(firstchildren.getChildren())">getChildren</button>
<button onclick="alert(firstchildren.all())">all</button>
<hr />
<h3>实例</h3>
<script>
/**
 * Example: convert the UBB text in the textarea to HTML by switching the
 * delimiters of every "b" element from "[" "]" to "<" ">", then show the
 * re-serialised text.
 */
function ubb2html() {
  var ubbDocument = MarkupLanguageDocument(document.getElementById("code").value, "[", "]");
  var allObject = ubbDocument.all();
  for (var i = 0; i < allObject.length; i++) {
    if (allObject[i].tagName == "b") {
      allObject[i].lt = "<";
      allObject[i].gt = ">";
    }
  }
  alert(ubbDocument.getOuterHTML());
}
</script>
<button onclick="ubb2html()">ubb2html</button>
<xmp style="background-color: #EEEEEE; padding: 10px;">
function ubb2html() {
  var ubbDocument = MarkupLanguageDocument(document.getElementById("code").value, "[", "]");
  var allObject = ubbDocument.all();
  for (var i = 0; i < allObject.length; i++) {
    if (allObject[i].tagName == "b") {
      allObject[i].lt = "<";
      allObject[i].gt = ">";
    }
  }
  alert(ubbDocument.getOuterHTML());
}
</xmp>
<script>
/*
 * Helpers for gluing regular expressions together.
 *
 * A back-reference such as \1 stops being valid once the pattern is embedded
 * after other capturing groups. getRelaStr() therefore rewrites every
 * back-reference as a marker that stores the group number RELATIVE to the
 * number of capturing groups opened before it; getAbsReg() turns the markers
 * back into absolute back-references after the pieces have been concatenated.
 */
(function () {
  // Tokens of a pattern source: back-reference, other escape, character class,
  // "(?" (non-capturing / look-around opener), capturing "(", any other char.
  // Classes and escapes are whole tokens so that a "(" inside them is never
  // counted as a group. Named groups "(?<name>" are NOT counted (unsupported).
  var RELATIVE_TOKENS = /\\[1-9]\d*|\\[\s\S]|\[(?:\\[\s\S]|[^\]\\])*\]|\(\?|\(|[\s\S]/g;
  // Same tokens, but looking for the relative marker instead of "\N".
  var ABSOLUTE_TOKENS = /______rela:(-?\d+)______|\\[\s\S]|\[(?:\\[\s\S]|[^\]\\])*\]|\(\?|\(|[\s\S]/g;

  /**
   * Builds a RegExp from a pattern string produced with getRelaStr().
   * @param {string} [s] RegExp flags.
   * @returns {RegExp} the pattern with every relative marker replaced by an
   *     absolute back-reference.
   */
  String.prototype.getAbsReg = function (s) {
    var groupsSeen = 0;
    var source = String(this).replace(ABSOLUTE_TOKENS, function (token, relative) {
      if (token == "(") {
        groupsSeen++;
        return token;
      }
      // `relative` is undefined (or "" in very old engines) for other tokens.
      if (relative) {
        return "\\" + (parseInt(relative, 10) + groupsSeen);
      }
      return token;
    });
    return new RegExp(source, s);
  };

  /**
   * Returns this expression's source with every back-reference replaced by a
   * relative marker, so it can be concatenated into a larger pattern.
   * @returns {string}
   */
  RegExp.prototype.getRelaStr = function () {
    var groupsSeen = 0;
    return this.source.replace(RELATIVE_TOKENS, function (token) {
      if (token == "(") {
        groupsSeen++;
        return token;
      }
      if (/^\\[1-9]/.test(token)) {
        return "______rela:" + (parseInt(token.substr(1), 10) - groupsSeen) + "______";
      }
      return token;
    });
  };
})();
</script>
<script>
/**
 * Parses marked-up text into a tree of elements and text strings.
 *
 * @param {string} html Text to parse.
 * @param {string} [lt] Opening delimiter; only its first character is used
 *     (default "<").
 * @param {string} [gt] Closing delimiter; only its first character is used
 *     (default ">").
 * @returns {Object} A root element without tagName whose childNodes hold the
 *     parsed nodes. Elements expose tagName, childNodes, attributes, lt, gt and
 *     the methods setInnerHTML, getInnerHTML, getOuterHTML, getAttributeNode,
 *     getChildren and all. Open tags that are never closed are closed
 *     automatically when the tree is serialised.
 */
function MarkupLanguageDocument(html, lt, gt) {
  // Group numbers inside nodeRe: 1 = whole node, 2 = "/" of a closing tag,
  // 3 = tag name. Groups 2 and 3 are unset when the node is plain text.
  var NODE_GROUP_CLOSE = 2;
  var NODE_GROUP_TAG = 3;

  // Compiled expressions per delimiter pair, so recursion does not rebuild them.
  var grammars = {};

  /** An element node; text nodes are plain strings. */
  function htmlElement(tagName) {
    this.tagName = tagName;
    this.childNodes = [];
    this.attributes = [];
  }

  /** Replaces the children by parsing `html`; lt/gt default to "<" and ">". */
  htmlElement.prototype.setInnerHTML = function (html, lt, gt) {
    this.childNodes = parseHTML(html, lt, gt);
    return this.childNodes;
  };

  /** Serialises all child nodes. */
  htmlElement.prototype.getInnerHTML = function () {
    var childrenHTML = "";
    for (var i = 0; i < this.childNodes.length; i++) {
      var child = this.childNodes[i];
      childrenHTML += child instanceof htmlElement ? child.getOuterHTML() : child;
    }
    return childrenHTML;
  };

  /**
   * Serialises this element with its own lt/gt delimiters. An element without
   * tagName (the document root) serialises to its children only; an element
   * without children is written in self-closing form.
   */
  htmlElement.prototype.getOuterHTML = function () {
    var childrenHTML = this.getInnerHTML();
    if (!this.tagName) {
      return childrenHTML;
    }
    var head = [this.tagName];
    for (var i = 0; i < this.attributes.length; i++) {
      var attr = this.attributes[i];
      head.push(attr.value ? attr.name + "=\"" + attr.value + "\"" : attr.name);
    }
    var open = this.lt + head.join(" ");
    if (!childrenHTML) {
      return open + " /" + this.gt;
    }
    return open + this.gt + childrenHTML + this.lt + "/" + this.tagName + this.gt;
  };

  /** Returns the first attribute object called `name`, or null. */
  htmlElement.prototype.getAttributeNode = function (name) {
    for (var i = 0; i < this.attributes.length; i++) {
      if (this.attributes[i].name == name) {
        return this.attributes[i];
      }
    }
    return null;
  };

  /** Returns the direct child elements (text nodes are skipped). */
  htmlElement.prototype.getChildren = function () {
    var childrenArray = [];
    for (var i = 0; i < this.childNodes.length; i++) {
      if (this.childNodes[i] instanceof htmlElement) {
        childrenArray.push(this.childNodes[i]);
      }
    }
    return childrenArray;
  };

  /** Returns every descendant element in document order. */
  htmlElement.prototype.all = function () {
    var allArray = [];
    var children = this.getChildren();
    for (var i = 0; i < children.length; i++) {
      allArray.push(children[i]);
      allArray = allArray.concat(children[i].all());
    }
    return allArray;
  };

  /** A name/value pair; value is "" for an attribute written without a value. */
  function attributeObj(name, value) {
    this.name = name;
    this.value = value;
  }

  /** Escapes a delimiter so it is literal both inside and outside a character class. */
  function escapeForRegExp(ch) {
    return ch.replace(/[\\^$.*+?()[\]{}|\/-]/g, "\\$&");
  }

  /** Builds (or fetches) the expressions for one delimiter pair. */
  function getGrammar(lt, gt) {
    var key = lt + gt;
    if (Object.prototype.hasOwnProperty.call(grammars, key)) {
      return grammars[key];
    }
    var lt1 = escapeForRegExp(lt);
    var gt1 = escapeForRegExp(gt);
    // Quoted string, e.g. "abc". [\s\S] replaces the original [^\1], which is
    // not a back-reference inside a class and broke after renumbering.
    var strRe = /(["'])(?:\\[\s\S]|[\s\S])*?\1/;
    // Attribute, e.g. attribute="abc" in <span attribute="abc">.
    // Groups: 1 = name, 3 = raw value. (?!\w) keeps a word from being split
    // into several names, which avoided exponential backtracking.
    var attrRe = ("(\\w+)(?!\\w)(\\s*=\\s*(" + strRe.getRelaStr() + "|[^\\s" + gt1 + "]+))?").getAbsReg();
    // Tag name, e.g. span in <span attribute="abc">.
    var tagRe = /((\w+:)?\w+)/;
    // Self-closing mark, e.g. the / in <span />.
    var sTagRe = new RegExp("\\/\\s*(?=" + gt1 + ")");
    // Closing mark, e.g. the / in </span>.
    var cTagRe = /\/\s*/;
    // Whole tag, e.g. <span attribute="abc" />.
    var objRe = (lt1 + "(" + cTagRe.getRelaStr() + ")?" + tagRe.getRelaStr() +
        "(" + attrRe.getRelaStr() + "|\\s)*?(" + sTagRe.getRelaStr() + ")?" + gt1).getAbsReg();
    // Node: a tag, or text up to the next opening delimiter. The text branch
    // always consumes at least one character so parsing cannot stall on a
    // delimiter that does not start a valid tag.
    var nodeRe = ("^(" + objRe.getRelaStr() + "|[\\s\\S][^" + lt1 + "]*)").getAbsReg();

    grammars[key] = {
      attrRe: attrRe,
      attrAllRe: new RegExp(attrRe.source, "g"),
      tagRe: tagRe,
      selfCloseRe: (sTagRe.getRelaStr() + gt1 + "$").getAbsReg(),
      nodeRe: nodeRe
    };
    return grammars[key];
  }

  /** Splits the first node (tag or text run) off the front of non-empty `html`. */
  function getFirstNode(html, grammar) {
    var match = grammar.nodeRe.exec(html);
    var nodeValue = match[0];
    var isTag = !!match[NODE_GROUP_TAG];
    return {
      nodeValue: nodeValue,
      otherHTML: html.substr(nodeValue.length),
      isTag: isTag,
      isClosing: isTag && !!match[NODE_GROUP_CLOSE],
      isSelfClosing: isTag && grammar.selfCloseRe.test(nodeValue),
      tagName: isTag ? nodeValue.match(grammar.tagRe)[0] : undefined
    };
  }

  /** Reads the attributes of an opening tag into attributeObj instances. */
  function readAttributes(first, grammar) {
    var attrs = [];
    // Drop the tag name first so it is not mistaken for an attribute.
    var found = first.nodeValue.replace(grammar.tagRe, "").match(grammar.attrAllRe) || [];
    for (var i = 0; i < found.length; i++) {
      var parts = found[i].match(grammar.attrRe);
      // parts[3] is unset for a valueless attribute; strip matching quotes otherwise.
      var value = (parts[3] || "").replace(/^(["'])([\s\S]*?)\1$/, "$2");
      attrs.push(new attributeObj(parts[1], value));
    }
    return attrs;
  }

  /**
   * Parses `html` into an array of elements and text strings.
   * @param {string} html
   * @param {string} [lt] opening delimiter, default "<"
   * @param {string} [gt] closing delimiter, default ">"
   * @returns {Array}
   */
  function parseHTML(html, lt, gt) {
    lt = lt ? lt.charAt(0) : "<";
    gt = gt ? gt.charAt(0) : ">";
    html = html == null ? "" : String(html);
    var grammar = getGrammar(lt, gt);
    var htmlArray = [];

    while (html) {
      var first = getFirstNode(html, grammar);
      html = first.otherHTML;

      if (!first.isTag || first.isClosing) {
        // Plain text, or a closing tag with nothing open to close: keep the
        // characters as text, merged with any preceding text node.
        var last = htmlArray.length - 1;
        if (last >= 0 && typeof htmlArray[last] == "string") {
          htmlArray[last] += first.nodeValue;
        } else {
          htmlArray.push(first.nodeValue);
        }
        continue;
      }

      var node = new htmlElement(first.tagName);
      node.attributes = readAttributes(first, grammar);
      node.lt = lt;
      node.gt = gt;

      // Collect everything up to the matching closing tag; `depth` counts
      // nested tags of the same name. Reaching the end of the input closes
      // the element implicitly.
      var depth = first.isSelfClosing ? 0 : 1;
      var childrenhtml = "";
      while (depth && html) {
        var next = getFirstNode(html, grammar);
        html = next.otherHTML;
        if (next.isTag && next.tagName == first.tagName && !next.isSelfClosing) {
          if (next.isClosing) {
            depth--;
          } else {
            depth++;
          }
        }
        if (depth) {
          childrenhtml += next.nodeValue;
        }
      }
      node.childNodes = parseHTML(childrenhtml, lt, gt);
      htmlArray.push(node);
    }
    return htmlArray;
  }

  var mlDocument = new htmlElement();
  mlDocument.childNodes = parseHTML(html, lt, gt);
  return mlDocument;
}

// Parse the sample once at load time; the inline button handlers read these globals.
var parseDocument = MarkupLanguageDocument(document.getElementById("code").value, "[", "]");
// First element of the document (leading text, if any, is skipped).
var firstchildren = parseDocument.getChildren()[0];
</script>
[Ctrl+A 全部选择提示：你可先修改部分代码，再按运行]

这个程序的处理方法大致是：通过正则表达式的拼接（getAbsReg 和 getRelaStr 这两个自定义函数），按照字符串 → 属性 → 对象 → 节点的顺序一级一级地拼接出它们的正则表达式，然后再从节点开始，反向地从文本流中提取对应的内容，并把这些内容设置到 htmlElement 类的相应属性上，这也就是在弹出框中看到 [object Object] 的原因。

最后，实例程序通过设置和使用 htmlElement 提供的属性和方法，将文本的每一个细节修改到最终所需要的状态，再使用 getOuterHTML 或 getInnerHTML 方法得到处理之后的文本流。

另外，用这种方法处理还可以达到标记自动补全的功能。（出处：蓝色理想）
php爱好者站 http://www.phpfans.net 网页制作|网站建设|数据采集.