Experimentation‎ > ‎

Tag Soup Parser/Fixer

Interestingly; there are loads of tag soup parsing behaviours between browsers and tools like tidy.exe

 

<body>

  <p>This is a sample test document.</p>

  a <em> b <address> c </em> d </address> e

 </body>

 

Win IE6 actually parses a cyclic graph (translate: not a tree!) out of this (which is both clever and ridiculous at the same time!):

 

The BODY element has five children: P, a, EM, d, and   e. EM has two children, b and ADDRESS. ADDRESS has two children: c and d. P, a, EM, and d are siblings, b,   ADDRESS, and e are siblings. c has no siblings. d is the child of two nodes (BODY and ADDRESS), but considers   ADDRESS to be its parent.

 

Firefox and tidy.exe stay acyclic

The BODY element has five children: P, a, EM, d, and   e. EM has two children, b and ADDRESS. ADDRESS has two children: c and d. P, a, EM, and d are siblings, b,   ADDRESS, and e are siblings. c has no siblings. d is the child of two nodes (BODY and ADDRESS), but considers   ADDRESS to be its parent.

 

Basically the common tree parsing behaviour is to late-close tags. I didn’t like that behaviour because it can result in massive nested structures (well it does on Legislative documents!)

 

The behaviour of this component is to omit the EM tags entirely.

 Download:

Content/tagsoup/Files/after.xml.txt

Content/tagsoup/Files/before.html.txt

Content/tagsoup/Files/SoupNamespaceRemover.cs.txt

Content/tagsoup/Files/TagSoupParser.cs.txt

Content/tagsoup/Files/TidyWrapper.cs.txt

 

using System;
using System.Collections.Generic;
using System.Text;
using System.Query;
using System.Text.RegularExpressions;

namespace Legislation.Core.DocumentModel.Parsing
{
	public static class TagSoupParser
	{
		static Regex _tagScope = new Regex( @"(?<TagContents><[^>]+?>)(?<Contents>[^<]*)(?=[<$])?", RegexOptions.Singleline );
	
		/// <summary>
		/// trs@260308
		/// the problem here is that tidy.exe's behaviour is to late-close tag soup rather than
		/// early-close-ommit. The former behaviour often results in massive nested structures
		/// 
		/// [SAMPLE]
		/// <c>
		/// <b> [WILL BE DELETED IF FOUND]
		/// <a>.......
		/// ........
		/// </b> [WILL BE DELETED]
		/// ..........
		/// .........</a>
		/// </c>
		/// [/SAMPLE]
		/// 
		/// [BEHAVIOUREXAMPLE]
		/// 01) 0H1
		/// 02) SO1 (H1)
		/// 03) 0C1 (H1, SO1)
		/// 04) 0S1 (H1, SO1, C1)
		/// 05) 0C1 (H1, SO1, C1, S1)
		/// 06) 0C2 (H1, SO1, C1, S1, C1)
		/// 07) 0C1 (H1, SO1, C1, S1)
		/// 08) 0C2 (H1, SO1, C1, S1, C1)
		/// 09) 0C2 (H1, SO1, C1, *S1*)  ERROR!!!
		/// 10) 0S2 (H1, SO1, S1)
		/// 11) S02 (H1, SO1)
		/// 12) 0H2 (H1)
		/// [/BEHAVIOUREXAMPLE]
		/// </summary>
		/// <param name="soup"></param>
		/// <returns></returns>
		public static string FixTagSoup( string soup ) {

			///capture
			var scopes = _tagScope.Matches(soup).OfType<Match>().Select(m => new TagScope
										{
											TagName = m.Groups["TagContents"].Value.Split(' ')[0].Replace("/", "").Replace(">", "").Replace("<", ""),
											Contents = m.Groups["Contents"].Value,
											TagContents = m.Groups["TagContents"].Value,
											IsEndTag = m.Groups["TagContents"].Value.Contains("</"),
											BlackList=false,
											SelfContaining = m.Groups["TagContents"].Value.Contains("/>") 
																|| m.Groups["TagContents"].Value.Contains("<!")
																|| m.Groups["TagContents"].Value.Contains("<?")

										}).ToList(); ///amazingly you need ToList; or the BlackList values get late-projected over the top!

			///track the currently open tags
			LinkedList<TagScope> openers = new LinkedList<TagScope>();
			
			///scan and reconstruct
			foreach ( TagScope scope in scopes.Where( s => !s.SelfContaining ) )
			{
				TagScope lastOpener = null;

				///count is an O(1) operation on .NET linkedlist
				if ( openers.Count > 0 ) lastOpener = openers.First.Value;

				if (lastOpener != null && scope.IsEndTag)
				{
					if (lastOpener.TagName != scope.TagName)
					{
						scope.BlackList = true;
						
						///now go back and kill off a matching opener if one exists
						BacktrackDelete(openers, scope);
					}

					else {
					
						///pop it off the beginning
						openers.RemoveFirst();
					}
				}

				if (!scope.IsEndTag) openers.AddFirst(scope);
			}
			
			///one more pass on the list
			return RenderFinalDocument(scopes, openers);
		}

		private static string RenderFinalDocument(List<TagScope> scopes, LinkedList<TagScope> openers)
		{
			///where we reconstruct the final HTML
			StringBuilder sb = new StringBuilder();
		
			foreach (TagScope scope in scopes)
			{
				///we ommit this if its a closing soup tag
				if (!scope.BlackList)
				{
					sb.Append(scope.TagContents);
				}

				///we always add this
				sb.Append(scope.Contents);
			}

			CloseOffRemainingTags(openers, sb);
			
			return sb.ToString();
		}

		/// <summary>
		/// another scenario is tags that simply never get closed
		/// tidy would have fixed this for us, but if we can do it here; it does no harm
		/// means that we should have a good hierachial tag structure after this component has run
		/// </summary>
		/// <param name="openers"></param>
		/// <param name="sb"></param>
		private static void CloseOffRemainingTags(LinkedList<TagScope> openers, StringBuilder sb)
		{
			///whoops! we have some open tags that still need closing off...
			foreach (TagScope scope in openers)
			{
				sb.Append(string.Format("</{0}>", scope.TagName));
			}
		}

		/// <summary>
		/// go back through the list and find the matching opening tag
		/// then delete it - this method is being called because the opening tag
		/// wasn't in the correct position
		/// </summary>
		/// <param name="openers"></param>
		/// <param name="scope"></param>
		private static void BacktrackDelete( LinkedList<TagScope> openers, TagScope scope )
		{
			for( LinkedListNode<TagScope> node = openers.First; 
					node.Next != null;
					node = node.Next ) {
			
				if (node.Value.TagName == scope.TagName)
				{
					openers.Remove( node );
					
					node.Value.BlackList = true;
					
					break;
				}
			}
		}
		
		public class TagScope
		{
			public string TagName;
			public string Contents;
			public string TagContents;
			public bool IsEndTag;
			public bool BlackList;
			public bool SelfContaining;
		}
	}
	
	
}