The following tests show how SgmlReader converts malformed HTML into valid XML. Each test lists the input HTML first, then the label "After", then the resulting XML. Note that extended characters may be displayed incorrectly because this page is generated on the fly from the HTML test file on GitHub.
<html> <body><span text /> </body> </html>After
<html>
<body>
<span text="text" />
</body>
</html>
<html> <body><span text="foo>bar"/> </body> </html>After
<html>
<body>
<span text="foo">bar"/>
</span>
</body>
</html>
<html> <body><span text="foo<bar"/> </body> </html>After
<html>
<body>
<span text="foo<bar" />
</body>
</html>
<html> <body> <tag>&test   blah blah</tag> </body> </html>After
<html>
<body>
<tag>&test blah blah</tag>
</body>
</html>
<html> <body> <tag>    blah blah</tag> </body> </html>After
<html>
<body>
<tag> blah blah</tag>
</body>
</html>
<html> <body> <p>bad char: <span>􀀀</span></p> </body> </html>After
<html>
<body>
<p>bad char: <span></span></p>
</body>
</html>
<html> <body> <P class=MsoNormal dir=ltr style="MARGIN: 0pt;" align=left><?xml:namespace prefix = st1 ns = "urn:schemas-microsoft-com:office:smarttags" /><ST1:PERSONNAME></ST1:PERSONNAME></P> </body> </html>After
<html>
<body>
<P class="MsoNormal" dir="ltr" style="MARGIN: 0pt;" align="left">
<?namespace
prefix = st1 ns = "urn:schemas-microsoft-com:office:smarttags"
?>
<ST1:PERSONNAME xmlns:ST1="#unknown">
</ST1:PERSONNAME>
</P>
</body>
</html>
<html> <body> <DIV STYLE="top:214px; left:139px; position:absolute; font-size:26px;"><NOBR><SPAN STYLE="font-family:"Wingdings 2";"></SPAN></NOBR></DIV> </body> </html>After
<html>
<body>
<DIV STYLE="top:214px; left:139px; position:absolute; font-size:26px;">
<NOBR>
<SPAN STYLE="font-family:" Wingdings="Wingdings" _x0032_=";">
</SPAN>
</NOBR>
</DIV>
</body>
</html>
<html> <body> <script type="text/javascript">/*<![CDATA[*/ var test = '<div>"test"</div>'; /*]]>*/</script> <p>test</p> </body> </html>After
<html>
<body>
<script type="text/javascript"><![CDATA[
var test = '<div>"test"</div>';
]]></script>
<p>test</p>
</body>
</html>
<html> <body>This <P>is bad </P> XHTML.</body> </html>After
<html> <body>This <p>is bad </p> XHTML.</body> </html>
<html> <body><span>some text</span> <span>more text</span></body> </html>After
<html> <body><span>some text</span> <span>more text</span></body> </html>
<html> <body><a href="http://www.cnn.com/"' title="cnn.com">cnn</a></body> </html>After
<html>
<body>
<a href="http://www.cnn.com/">cnn</a>
</body>
</html>
<html> <head> <style> <!-- </style> </head> </html>After
<html>
<head>
<style>
<!--
</style>
</head>
</html>
-->
</style>
</head>
</html>
<html> <body>'</body> </html>After
<html> <body>'</body> </html>
<script type="text/javascript></script>After
<html> <script type="text/javascript"> </script> </html>
<html xmlns="http://www.w3.org/1999/xhtml"><head /><body><table u1:str="" x:str=""></table></body></html>After
<html xmlns="http://www.w3.org/1999/xhtml">
<head />
<body>
<table u1:str="" x:str="" xmlns:x="#unknown1" xmlns:u1="#unknown">
</table>
</body>
</html>
<html>
<body>²</body>
</html>
After
<html> <body>²</body> </html>
<html>
<body>
<something@something.com>
</body>
</html>
After
<html> <body><something@something.com></body> </html>
<html>
<body>
<script type="text/javascript">/*<![CDATA[*/ /*<![CDATA[*/ test /*]]>*/ /*]]>*/</script>
</body>
</html>
After
<html>
<body>
<script type="text/javascript"><![CDATA[ test /*]]>*/]]></script>
</body>
</html>
<html>
<body>
<style>div.wiki { float: right; }</style>
<em>foo</em>
</body>
</html>
After
<html>
<body>
<style><![CDATA[div.wiki { float: right; }]]></style>
<em>foo</em>
</body>
</html>
<html><body><title>Title</title><foo>foo</foo></body></html>After
<html>
<body>
<title>Title</title>
<foo>foo</foo>
</body>
</html>
<html><body> <p class="MsoNormal"> <span style="font-size: 10pt;" arial="" ,="" sans-serif="" ;;="" font-family:dummy:="" font-family:="" font-family:foo:="" arial;="" font-size:="" 13.3333px;=""> <span class="Apple-style-span" style="font-family: Arial; font-size: 13.3333px;">-lm</span> </span> </p> </body></html>After
<html>
<body>
<p class="MsoNormal">
<span style="font-size: 10pt;" arial="" sans-serif="">
<span class="Apple-style-span" style="font-family: Arial; font-size: 13.3333px;">-lm</span>
</span>
</p>
</body>
</html>
<html><body>do <![if !supportLists]>not<![endif]> lose this text</body></html>After
<html> <body>do not lose this text</body> </html>
<html xmlns="http://implicit" xmlns:n="http://explicit"><foo attr1="1" n:attr2="2" /><n:foo attr1="1" n:attr2="2" /></html>After
<html xmlns="http://implicit" xmlns:n="http://explicit"> <foo attr1="1" n:attr2="2" /> <n:foo attr1="1" n:attr2="2" /> </html>
<html xmlns:n="http://explicit"><foo attr1="1" n:attr2="2" /><n:foo attr1="1" n:attr2="2" /></html>After
<html xmlns:n="http://explicit"> <foo attr1="1" n:attr2="2" /> <n:foo attr1="1" n:attr2="2" /> </html>
<html xmlns:n="http://explicit"><foo attr1="1" n:attr2="2" /><n:foo attr1="1" n:attr2="2" /></html>After
<html xmlns:n="http://explicit"> <foo attr1="1" n:attr2="2" /> <n:foo attr1="1" n:attr2="2" /> </html>
<html><foo xmlns:n="http://explicit" attr1="1" n:attr2="2" /></html>After
<html> <foo xmlns:n="http://explicit" attr1="1" n:attr2="2" /> </html>
<html><foo xmlns:n="http://explicit" attr1="1" n:attr2="2" /></html>After
<html> <foo xmlns:n="http://explicit" attr1="1" n:attr2="2" /> </html>
<html xmlns:o="http://microsoft.com"><body>A<o:p></o:p>B<o:p></o:p></body></html>After
<html xmlns:o="http://microsoft.com"> <body>A<o:p></o:p>B<o:p></o:p></body> </html>
<html xmlns:o="http://microsoft.com"><body>A<o:p></o:p>B<o:p></o:p></body></html>After
<html xmlns:o="http://microsoft.com"> <body>A<o:p />B<o:p /></body> </html>
<html><body>A<o:p></o:p>B<o:p></o:p></body></html>After
<html> <body>A<o:p xmlns:o="#unknown"></o:p>B<o:p xmlns:o="#unknown"></o:p></body> </html>
<html><body>A<o:p></o:p>B<o:p></o:p></body></html>After
<html> <body>A<o:p xmlns:o="#unknown" />B<o:p xmlns:o="#unknown" /></body> </html>
<html><body>After
<html> <body> </body> </html>
<html>After
<html> </html>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> <html>After
<html> </html>
<html> <body> <table><tr><td>row1<tr><td>row2</td>After
<html>
<body>
<table>
<tr>
<td>row1</td>
</tr>
<tr>
<td>row2</td>
</tr>
</table>
</body>
</html>
<html> <head> <script language="JavaScript"> <!-- --></script> </head> <body> <p>hello</p> </body> </html>After
<html>
<head>
<script language="JavaScript">
<!--
-->
</script>
</head>
<body>
<p>hello</p>
</body>
</html>
<html> <![CDATA[this is a CDATA block with markup <table><tr><td> ]]> </html>After
<html><![CDATA[this is a CDATA block with markup <table><tr><td> ]]></html>
<p>This is really <messed_up.< p>.After
<html> <p>This is really <messed_up.>< p>. </messed_up.></p> </html>
<html><class="black">Text………</html>After
<html> <class>Text………</class> </html>
<p>©</p> <br/>After
<html> <p>©</p> <br /> </html>
<html> <img src="img.gif" height"4" width= 2 > </html>After
<html> <img src="img.gif" height="4" width="2" /> </html>
<html> <script><![CDATA[this is a test]]></script> </html>After
<html> <script><![CDATA[this is a test]]></script> </html>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" > <HTML></HTML>After
<html> </html>
<b>foo</b>After
<html> <b>foo</b> </html>
blah <b>foo</b>After
<html>blah <b>foo</b></html>
<!-- top --> <b>foo</b>After
<!-- top --> <html> <b>foo</b> </html>
<html> <body> <p>ZZZ test Z</p>After
<html>
<body>
<p>ZZZ test Z</p>
</body>
</html>
<html> <?xml version="1.0" encoding="UTF-16"?> </html>After
<html> </html>
<html><?xml:namespace prefix="st1" ns="urn:schemas-microsoft-com:office:smarttags" /> <body>After
<html> <?namespace prefix="st1" ns="urn:schemas-microsoft-com:office:smarttags" ?> <body> </body> </html>
<html xmlns:portal="http://schemas.microsoft.com/msn/portal/controls"><head><title>Welcome to MSN.com</title>After
<html xmlns:portal="http://schemas.microsoft.com/msn/portal/controls">
<head>
<title>Welcome to MSN.com</title>
</head>
</html>
<html xmlns:portal="http://schemas.microsoft.com/msn/portal/controls"><head><title>Welcome to MSN.com</title>After
<html xmlns:portal="http://schemas.microsoft.com/msn/portal/controls">
<head>
<title>Welcome to MSN.com</title>
</head>
</html>
| File | Version | Size | Modified | |
|---|---|---|---|---|
| ||||
| Images 0 | ||
|---|---|---|
| No images to display in the gallery. |
Copyright © 2011 MindTouch, Inc. Powered by
- Regin
Thanks