Interestingly; there are loads of tag soup parsing behaviours between browsers and tools like tidy.exe
<body> <p>This is a sample test document.</p> a <em> b <address> c </em> d </address> e </body>
Win IE6 actually parses a cyclic graph (translate: not a tree!) out of this (which is both clever and ridiculous at the same time!):
Firefox and tidy.exe stay acyclic
Basically the common tree parsing behaviour is to late-close tags. I didn’t like that behaviour because it can result in massive nested structures (well it does on Legislative documents!)
The behaviour of this component is to omit the EM tags entirely. Download: Content/tagsoup/Files/after.xml.txt Content/tagsoup/Files/before.html.txt Content/tagsoup/Files/SoupNamespaceRemover.cs.txt Content/tagsoup/Files/TagSoupParser.cs.txt Content/tagsoup/Files/TidyWrapper.cs.txt
using System;
using System.Collections.Generic;
using System.Text;
using System.Query;
using System.Text.RegularExpressions;
namespace Legislation.Core.DocumentModel.Parsing
{
public static class TagSoupParser
{
static Regex _tagScope = new Regex( @"(?<TagContents><[^>]+?>)(?<Contents>[^<]*)(?=[<$])?", RegexOptions.Singleline );
/// <summary>
/// trs@260308
/// the problem here is that tidy.exe's behaviour is to late-close tag soup rather than
/// early-close-ommit. The former behaviour often results in massive nested structures
///
/// [SAMPLE]
/// <c>
/// <b> [WILL BE DELETED IF FOUND]
/// <a>.......
/// ........
/// </b> [WILL BE DELETED]
/// ..........
/// .........</a>
/// </c>
/// [/SAMPLE]
///
/// [BEHAVIOUREXAMPLE]
/// 01) 0H1
/// 02) SO1 (H1)
/// 03) 0C1 (H1, SO1)
/// 04) 0S1 (H1, SO1, C1)
/// 05) 0C1 (H1, SO1, C1, S1)
/// 06) 0C2 (H1, SO1, C1, S1, C1)
/// 07) 0C1 (H1, SO1, C1, S1)
/// 08) 0C2 (H1, SO1, C1, S1, C1)
/// 09) 0C2 (H1, SO1, C1, *S1*) ERROR!!!
/// 10) 0S2 (H1, SO1, S1)
/// 11) S02 (H1, SO1)
/// 12) 0H2 (H1)
/// [/BEHAVIOUREXAMPLE]
/// </summary>
/// <param name="soup"></param>
/// <returns></returns>
public static string FixTagSoup( string soup ) {
///capture
var scopes = _tagScope.Matches(soup).OfType<Match>().Select(m => new TagScope
{
TagName = m.Groups["TagContents"].Value.Split(' ')[0].Replace("/", "").Replace(">", "").Replace("<", ""),
Contents = m.Groups["Contents"].Value,
TagContents = m.Groups["TagContents"].Value,
IsEndTag = m.Groups["TagContents"].Value.Contains("</"),
BlackList=false,
SelfContaining = m.Groups["TagContents"].Value.Contains("/>")
|| m.Groups["TagContents"].Value.Contains("<!")
|| m.Groups["TagContents"].Value.Contains("<?")
}).ToList(); ///amazingly you need ToList; or the BlackList values get late-projected over the top!
///track the currently open tags
LinkedList<TagScope> openers = new LinkedList<TagScope>();
///scan and reconstruct
foreach ( TagScope scope in scopes.Where( s => !s.SelfContaining ) )
{
TagScope lastOpener = null;
///count is an O(1) operation on .NET linkedlist
if ( openers.Count > 0 ) lastOpener = openers.First.Value;
if (lastOpener != null && scope.IsEndTag)
{
if (lastOpener.TagName != scope.TagName)
{
scope.BlackList = true;
///now go back and kill off a matching opener if one exists
BacktrackDelete(openers, scope);
}
else {
///pop it off the beginning
openers.RemoveFirst();
}
}
if (!scope.IsEndTag) openers.AddFirst(scope);
}
///one more pass on the list
return RenderFinalDocument(scopes, openers);
}
private static string RenderFinalDocument(List<TagScope> scopes, LinkedList<TagScope> openers)
{
///where we reconstruct the final HTML
StringBuilder sb = new StringBuilder();
foreach (TagScope scope in scopes)
{
///we ommit this if its a closing soup tag
if (!scope.BlackList)
{
sb.Append(scope.TagContents);
}
///we always add this
sb.Append(scope.Contents);
}
CloseOffRemainingTags(openers, sb);
return sb.ToString();
}
/// <summary>
/// another scenario is tags that simply never get closed
/// tidy would have fixed this for us, but if we can do it here; it does no harm
/// means that we should have a good hierachial tag structure after this component has run
/// </summary>
/// <param name="openers"></param>
/// <param name="sb"></param>
private static void CloseOffRemainingTags(LinkedList<TagScope> openers, StringBuilder sb)
{
///whoops! we have some open tags that still need closing off...
foreach (TagScope scope in openers)
{
sb.Append(string.Format("</{0}>", scope.TagName));
}
}
/// <summary>
/// go back through the list and find the matching opening tag
/// then delete it - this method is being called because the opening tag
/// wasn't in the correct position
/// </summary>
/// <param name="openers"></param>
/// <param name="scope"></param>
private static void BacktrackDelete( LinkedList<TagScope> openers, TagScope scope )
{
for( LinkedListNode<TagScope> node = openers.First;
node.Next != null;
node = node.Next ) {
if (node.Value.TagName == scope.TagName)
{
openers.Remove( node );
node.Value.BlackList = true;
break;
}
}
}
public class TagScope
{
public string TagName;
public string Contents;
public string TagContents;
public bool IsEndTag;
public bool BlackList;
public bool SelfContaining;
}
}
}
|
