Read XHTML into XDocument
HTML can be well-formed XML (XHTML 1.0, XHTML 1.1 and HTML5 as XHTML), but XML parsers don't understand the HTML named entities (such as &nbsp;). This is a class for reading well-formed XHTML - with or without a DOCTYPE - into a .NET XDocument, as well as saving it, preserving the named entities.
See also reading XHTML into XmlDocument.
- Read in an HTML string with the static method: var doc = HtmlDocument.ParseHtml(html)
- Use standard XDocument selects/updates, with the namespace (doc.XHtml or for namespace manager doc.Ns with "html:" prefix, as applicable)
- Get the HTML with doc.WriteHtml(), which automatically handles &amp;, &gt;, &lt;, &quot; and &nbsp;.
- If you want all HTML entities written as named entities, use the static method: html = HtmlDocument.EntitizeHtml(html)
- If it's not well-formed HTML, use Html Agility Pack instead
Example use
//needs: using System.IO; using System.Linq; using System.Xml.Linq; using System.Xml.XPath;
//read the text
var html = File.ReadAllText(path);
//parse it into an XDocument
var doc = HtmlDocument.ParseHtml(html);
//always use the namespace (in HtmlDocument.XHtml)
//FirstOrDefault returns null (instead of throwing) when the page has no <input>
var input = doc.Descendants(doc.XHtml + "input").FirstOrDefault();
if (input != null)
{
    input.SetAttributeValue("autocomplete", "off"); //change it
}
//use XPath with a namespace manager (the "html" prefix is registered in doc.Ns)
var footer = doc.XPathSelectElement("//html:footer", doc.Ns);
if (footer != null)
{
    footer.SetValue("\u00A9 MyCompany"); //©
}
//get the updated html string
html = doc.WriteHtml();
//fix other named entities (if required - not needed for &gt;, &lt;, &quot;, &nbsp;)
html = HtmlDocument.EntitizeHtml(html);
//save it
File.WriteAllText(path, html);
//needs: using System.IO; using System.Linq; using System.Xml.Linq; using System.Xml.XPath;
//read the text
var html = File.ReadAllText(path);
//parse it into an XDocument
var doc = HtmlDocument.ParseHtml(html);
//always use the namespace (in HtmlDocument.XHtml)
//FirstOrDefault returns null (instead of throwing) when the page has no <input>
var input = doc.Descendants(doc.XHtml + "input").FirstOrDefault();
if (input != null)
{
    input.SetAttributeValue("autocomplete", "off"); //change it
}
//use XPath with a namespace manager (the "html" prefix is registered in doc.Ns)
var footer = doc.XPathSelectElement("//html:footer", doc.Ns);
if (footer != null)
{
    footer.SetValue("\u00A9 MyCompany"); //©
}
//get the updated html string
html = doc.WriteHtml();
//fix other named entities (if required - not needed for &gt;, &lt;, &quot;, &nbsp;)
html = HtmlDocument.EntitizeHtml(html);
//save it
File.WriteAllText(path, html);
Code
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Xml;
using System.Xml.Linq;
namespace Library
{
    /// <summary>
    /// An HTML page as a XDocument. Use <see cref="ParseHtml"/> to load html, <see cref="WriteHtml"/> to write.
    /// </summary>
    /// <remarks>
    /// To let the XML parser resolve HTML named entities, a missing or HTML5 doctype is temporarily
    /// replaced by the XHTML 1.0 Transitional doctype followed by a marker comment
    /// ("&lt;!-- HtmlDocument-... --&gt;"). <see cref="WriteHtml"/> uses the marker to restore the original form.
    /// </remarks>
    public class HtmlDocument : XDocument
    {
        /// <summary>Length of the "&lt;!ENTITY " prefix that precedes the entity name in a DTD entity line.</summary>
        private const int EntityNameStart = 9;

        /// <summary>
        /// Initializes a new instance of the <see cref="HtmlDocument"/> class.
        /// </summary>
        /// <param name="other">The <see cref="T:System.Xml.Linq.XDocument" /> object that will be copied.</param>
        public HtmlDocument(XDocument other)
            : base(other)
        {
        }

        /// <summary>
        /// The namespace of the root element. Also select elements with this name (eg doc.XHtml + "body")
        /// </summary>
        public XNamespace XHtml { get; set; }

        /// <summary>
        /// Namespace manager used for XPath queries; <see cref="ParseHtml"/> registers the "html" prefix in it.
        /// </summary>
        public XmlNamespaceManager Ns { get; internal set; }

        /// <summary>
        /// Convert a string of html into an XDocument. HTML must be well-formed xml, but could have any common Doctype including Html5 (or no DocType)
        /// </summary>
        /// <param name="html">The well-formed (X)HTML text.</param>
        /// <returns>The parsed document, with <see cref="XHtml"/> and <see cref="Ns"/> filled in.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="html"/> is null or empty.</exception>
        public static HtmlDocument ParseHtml(string html)
        {
            html = SanitizeHtml(html);
            using (var reader = XmlReader.Create(new StringReader(html.Trim()), XmlReaderSettings))
            {
                var xDocument = Load(reader, LoadOptions.PreserveWhitespace);
                var doc = new HtmlDocument(xDocument);

                //get the xmlns (maybe absent or not official html); never leave XHtml null
                //so registering the prefix below cannot dereference null
                doc.XHtml = doc.Root != null ? doc.Root.Name.Namespace : XNamespace.None;

                if (reader.NameTable != null)
                {
                    doc.Ns = new XmlNamespaceManager(reader.NameTable);
                    doc.Ns.AddNamespace("html", doc.XHtml.ToString());
                }

                //InternalSubset is an empty string and should be null (other dtd has "[]" at the end)
                if (doc.DocumentType != null && string.IsNullOrEmpty(doc.DocumentType.InternalSubset))
                {
                    doc.DocumentType.InternalSubset = null;
                }
                return doc;
            }
        }

        /// <summary>
        /// Makes sure the text starts with a doctype that has a DTD, adding a marker comment
        /// when the original doctype was missing or was the short Html5 form.
        /// </summary>
        /// <param name="html">The raw html text.</param>
        /// <returns>The trimmed html, possibly prefixed with a temporary doctype and marker comment.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="html"/> is null or empty.</exception>
        static string SanitizeHtml(string html)
        {
            if (string.IsNullOrEmpty(html)) throw new ArgumentNullException("html");
            html = html.Trim();

            //look for a DocType
            var start = html.IndexOf("<!DOCTYPE", StringComparison.OrdinalIgnoreCase);

            //1. There is no DocType - add XHtml 1 Transitional
            if (start == -1)
            {
                //assume no <?xml version="1.0"?> declaration (it would have to come before the doctype)
                return @"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 Transitional//EN"" ""http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"">
<!-- HtmlDocument-NoDOCTYPE -->
" + html;
            }

            //2. There is an Html5 DocType - swap it for XHtml 1 Transitional
            //(a doctype this short cannot carry a public/system identifier)
            var end = html.IndexOf(">", start, StringComparison.OrdinalIgnoreCase);
            if (end - start <= "<!DOCTYPE html>".Length)
            {
                //everything up to and including the short doctype is dropped
                return @"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 Transitional//EN"" ""http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"">
<!-- HtmlDocument-DOCTYPE html -->
" + html.Substring(end + 1);
            }

            //3. There is some other, longer DocType - assume it's ok.
            return html;
        }

        /// <summary>
        /// Writes the HTML. Only xml named entities plus nbsp are explicitly written.
        /// </summary>
        /// <returns>
        /// The document text. If <see cref="ParseHtml"/> inserted a temporary doctype, it and the marker comment
        /// are removed and the Html5 doctype is reinserted when that was the original form.
        /// </returns>
        public string WriteHtml()
        {
            var html = ToString();

            //does it have our marker comment?
            var startMarker = html.IndexOf("<!-- HtmlDocument-", StringComparison.OrdinalIgnoreCase);
            if (startMarker != -1)
            {
                var endMarker = html.IndexOf(">", startMarker, StringComparison.OrdinalIgnoreCase) + 1;
                var marker = html.Substring(startMarker, endMarker - startMarker);

                //remove the temporary DTD and marker comment
                html = html.Substring(endMarker).Trim();
                if (string.Equals(marker, "<!-- HtmlDocument-DOCTYPE html -->", StringComparison.OrdinalIgnoreCase))
                {
                    //reinsert the html5 doctype
                    html = "<!DOCTYPE html>\r\n" + html;
                }
                //for the "NoDOCTYPE" marker there was no doctype, so nothing is reinserted
            }

            //for non-breaking space only, show the named entity
            //(previously the character was swapped for a plain space, losing the nbsp)
            html = html.Replace("\u00A0", "&nbsp;");

            //for other entities, use EntitizeHtml method
            return html;
        }

        /// <summary>
        /// Replaces resolved characters with the corresponding html named entity
        /// </summary>
        /// <param name="html">The html text (typically the result of <see cref="WriteHtml"/>).</param>
        /// <returns>The text with every character found in the entity table written as "&amp;name;".</returns>
        /// <exception cref="ArgumentNullException"><paramref name="html"/> is null or empty.</exception>
        public static string EntitizeHtml(string html)
        {
            if (string.IsNullOrEmpty(html)) throw new ArgumentNullException("html");
            var entities = LoadHtmlEntities();
            foreach (var keyPair in entities)
            {
                var ch = (char)keyPair.Key;
                html = html.Replace(ch.ToString(), "&" + keyPair.Value + ";");
            }
            return html;
        }

        /// <summary>
        /// Reads the embedded entity declarations ("Library.ParseXHtml.xhtml-entities.ent") into a
        /// code point to entity name table. The five xml entities (quot, gt, lt, amp, apos) are left out.
        /// </summary>
        /// <returns>The table; empty when the embedded resource is not found.</returns>
        static Dictionary<int, string> LoadHtmlEntities()
        {
            var entities = new Dictionary<int, string>();
            using (var stream = System.Reflection.Assembly.GetExecutingAssembly()
                .GetManifestResourceStream("Library.ParseXHtml.xhtml-entities.ent"))
            {
                if (stream == null) return entities;
                using (var sr = new StreamReader(stream))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        //expected shape: <!ENTITY name   "&#160;"> <!-- description -->
                        if (!line.StartsWith("<!ENTITY", StringComparison.OrdinalIgnoreCase)) continue;

                        var q1 = line.IndexOf("\"&#", StringComparison.Ordinal);
                        if (q1 <= EntityNameStart) continue; //no value, or no room for a name

                        //search from the opening quote so the closing ;" can never precede it
                        var q2 = line.IndexOf(";\"", q1, StringComparison.Ordinal);
                        if (q2 == -1) continue;

                        //take everything between "<!ENTITY " and the opening quote; Trim removes the padding.
                        //(A fixed "q1 - 11" length cut the last letter off names followed by a single space.)
                        var ent = line.Substring(EntityNameStart, q1 - EntityNameStart).Trim();
                        if (ent.Length == 0) continue;
                        if (ent == "quot" || ent == "gt" || ent == "lt" || ent == "amp" || ent == "apos")
                        {
                            continue; //done automatically by the XmlWriter in ToString
                        }

                        var v = line.Substring(q1 + 3, q2 - q1 - 3);
                        var key = int.Parse(v, CultureInfo.InvariantCulture);
                        entities.Add(key, ent);
                    }
                }
            }
            return entities;
        }

        /// <summary>
        /// Reader settings that allow DTD processing and resolve the DTD through <c>HtmlResolver</c>.
        /// A new settings object is created on every access.
        /// </summary>
        static XmlReaderSettings XmlReaderSettings
        {
            get
            {
                var readerSettings = new XmlReaderSettings
                {
                    DtdProcessing = DtdProcessing.Parse,
                    XmlResolver = new HtmlResolver(),
                    //.net 4 includes the full XHtml 1.0 DTD
                    //XmlResolver = new XmlPreloadedResolver(XmlKnownDtds.Xhtml10),
                };
                return readerSettings;
            }
        }
    }
}
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Xml;
using System.Xml.Linq;
namespace Library
{
    /// <summary>
    /// An HTML page as a XDocument. Use <see cref="ParseHtml"/> to load html, <see cref="WriteHtml"/> to write.
    /// </summary>
    /// <remarks>
    /// To let the XML parser resolve HTML named entities, a missing or HTML5 doctype is temporarily
    /// replaced by the XHTML 1.0 Transitional doctype followed by a marker comment
    /// ("&lt;!-- HtmlDocument-... --&gt;"). <see cref="WriteHtml"/> uses the marker to restore the original form.
    /// </remarks>
    public class HtmlDocument : XDocument
    {
        /// <summary>Length of the "&lt;!ENTITY " prefix that precedes the entity name in a DTD entity line.</summary>
        private const int EntityNameStart = 9;

        /// <summary>
        /// Initializes a new instance of the <see cref="HtmlDocument"/> class.
        /// </summary>
        /// <param name="other">The <see cref="T:System.Xml.Linq.XDocument" /> object that will be copied.</param>
        public HtmlDocument(XDocument other)
            : base(other)
        {
        }

        /// <summary>
        /// The namespace of the root element. Also select elements with this name (eg doc.XHtml + "body")
        /// </summary>
        public XNamespace XHtml { get; set; }

        /// <summary>
        /// Namespace manager used for XPath queries; <see cref="ParseHtml"/> registers the "html" prefix in it.
        /// </summary>
        public XmlNamespaceManager Ns { get; internal set; }

        /// <summary>
        /// Convert a string of html into an XDocument. HTML must be well-formed xml, but could have any common Doctype including Html5 (or no DocType)
        /// </summary>
        /// <param name="html">The well-formed (X)HTML text.</param>
        /// <returns>The parsed document, with <see cref="XHtml"/> and <see cref="Ns"/> filled in.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="html"/> is null or empty.</exception>
        public static HtmlDocument ParseHtml(string html)
        {
            html = SanitizeHtml(html);
            using (var reader = XmlReader.Create(new StringReader(html.Trim()), XmlReaderSettings))
            {
                var xDocument = Load(reader, LoadOptions.PreserveWhitespace);
                var doc = new HtmlDocument(xDocument);

                //get the xmlns (maybe absent or not official html); never leave XHtml null
                //so registering the prefix below cannot dereference null
                doc.XHtml = doc.Root != null ? doc.Root.Name.Namespace : XNamespace.None;

                if (reader.NameTable != null)
                {
                    doc.Ns = new XmlNamespaceManager(reader.NameTable);
                    doc.Ns.AddNamespace("html", doc.XHtml.ToString());
                }

                //InternalSubset is an empty string and should be null (other dtd has "[]" at the end)
                if (doc.DocumentType != null && string.IsNullOrEmpty(doc.DocumentType.InternalSubset))
                {
                    doc.DocumentType.InternalSubset = null;
                }
                return doc;
            }
        }

        /// <summary>
        /// Makes sure the text starts with a doctype that has a DTD, adding a marker comment
        /// when the original doctype was missing or was the short Html5 form.
        /// </summary>
        /// <param name="html">The raw html text.</param>
        /// <returns>The trimmed html, possibly prefixed with a temporary doctype and marker comment.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="html"/> is null or empty.</exception>
        static string SanitizeHtml(string html)
        {
            if (string.IsNullOrEmpty(html)) throw new ArgumentNullException("html");
            html = html.Trim();

            //look for a DocType
            var start = html.IndexOf("<!DOCTYPE", StringComparison.OrdinalIgnoreCase);

            //1. There is no DocType - add XHtml 1 Transitional
            if (start == -1)
            {
                //assume no <?xml version="1.0"?> declaration (it would have to come before the doctype)
                return @"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 Transitional//EN"" ""http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"">
<!-- HtmlDocument-NoDOCTYPE -->
" + html;
            }

            //2. There is an Html5 DocType - swap it for XHtml 1 Transitional
            //(a doctype this short cannot carry a public/system identifier)
            var end = html.IndexOf(">", start, StringComparison.OrdinalIgnoreCase);
            if (end - start <= "<!DOCTYPE html>".Length)
            {
                //everything up to and including the short doctype is dropped
                return @"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 Transitional//EN"" ""http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"">
<!-- HtmlDocument-DOCTYPE html -->
" + html.Substring(end + 1);
            }

            //3. There is some other, longer DocType - assume it's ok.
            return html;
        }

        /// <summary>
        /// Writes the HTML. Only xml named entities plus nbsp are explicitly written.
        /// </summary>
        /// <returns>
        /// The document text. If <see cref="ParseHtml"/> inserted a temporary doctype, it and the marker comment
        /// are removed and the Html5 doctype is reinserted when that was the original form.
        /// </returns>
        public string WriteHtml()
        {
            var html = ToString();

            //does it have our marker comment?
            var startMarker = html.IndexOf("<!-- HtmlDocument-", StringComparison.OrdinalIgnoreCase);
            if (startMarker != -1)
            {
                var endMarker = html.IndexOf(">", startMarker, StringComparison.OrdinalIgnoreCase) + 1;
                var marker = html.Substring(startMarker, endMarker - startMarker);

                //remove the temporary DTD and marker comment
                html = html.Substring(endMarker).Trim();
                if (string.Equals(marker, "<!-- HtmlDocument-DOCTYPE html -->", StringComparison.OrdinalIgnoreCase))
                {
                    //reinsert the html5 doctype
                    html = "<!DOCTYPE html>\r\n" + html;
                }
                //for the "NoDOCTYPE" marker there was no doctype, so nothing is reinserted
            }

            //for non-breaking space only, show the named entity
            //(previously the character was swapped for a plain space, losing the nbsp)
            html = html.Replace("\u00A0", "&nbsp;");

            //for other entities, use EntitizeHtml method
            return html;
        }

        /// <summary>
        /// Replaces resolved characters with the corresponding html named entity
        /// </summary>
        /// <param name="html">The html text (typically the result of <see cref="WriteHtml"/>).</param>
        /// <returns>The text with every character found in the entity table written as "&amp;name;".</returns>
        /// <exception cref="ArgumentNullException"><paramref name="html"/> is null or empty.</exception>
        public static string EntitizeHtml(string html)
        {
            if (string.IsNullOrEmpty(html)) throw new ArgumentNullException("html");
            var entities = LoadHtmlEntities();
            foreach (var keyPair in entities)
            {
                var ch = (char)keyPair.Key;
                html = html.Replace(ch.ToString(), "&" + keyPair.Value + ";");
            }
            return html;
        }

        /// <summary>
        /// Reads the embedded entity declarations ("Library.ParseXHtml.xhtml-entities.ent") into a
        /// code point to entity name table. The five xml entities (quot, gt, lt, amp, apos) are left out.
        /// </summary>
        /// <returns>The table; empty when the embedded resource is not found.</returns>
        static Dictionary<int, string> LoadHtmlEntities()
        {
            var entities = new Dictionary<int, string>();
            using (var stream = System.Reflection.Assembly.GetExecutingAssembly()
                .GetManifestResourceStream("Library.ParseXHtml.xhtml-entities.ent"))
            {
                if (stream == null) return entities;
                using (var sr = new StreamReader(stream))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        //expected shape: <!ENTITY name   "&#160;"> <!-- description -->
                        if (!line.StartsWith("<!ENTITY", StringComparison.OrdinalIgnoreCase)) continue;

                        var q1 = line.IndexOf("\"&#", StringComparison.Ordinal);
                        if (q1 <= EntityNameStart) continue; //no value, or no room for a name

                        //search from the opening quote so the closing ;" can never precede it
                        var q2 = line.IndexOf(";\"", q1, StringComparison.Ordinal);
                        if (q2 == -1) continue;

                        //take everything between "<!ENTITY " and the opening quote; Trim removes the padding.
                        //(A fixed "q1 - 11" length cut the last letter off names followed by a single space.)
                        var ent = line.Substring(EntityNameStart, q1 - EntityNameStart).Trim();
                        if (ent.Length == 0) continue;
                        if (ent == "quot" || ent == "gt" || ent == "lt" || ent == "amp" || ent == "apos")
                        {
                            continue; //done automatically by the XmlWriter in ToString
                        }

                        var v = line.Substring(q1 + 3, q2 - q1 - 3);
                        var key = int.Parse(v, CultureInfo.InvariantCulture);
                        entities.Add(key, ent);
                    }
                }
            }
            return entities;
        }

        /// <summary>
        /// Reader settings that allow DTD processing and resolve the DTD through <c>HtmlResolver</c>.
        /// A new settings object is created on every access.
        /// </summary>
        static XmlReaderSettings XmlReaderSettings
        {
            get
            {
                var readerSettings = new XmlReaderSettings
                {
                    DtdProcessing = DtdProcessing.Parse,
                    XmlResolver = new HtmlResolver(),
                    //.net 4 includes the full XHtml 1.0 DTD
                    //XmlResolver = new XmlPreloadedResolver(XmlKnownDtds.Xhtml10),
                };
                return readerSettings;
            }
        }
    }
}