Read XHTML into XDocument

HTML can be well-formed XML (XHTML 1.0, XHTML 1.1 and HTML5 as XHTML), but XML parsers don't understand the HTML named entities (such as `&nbsp;`). This is a class for reading well-formed XHTML - with or without a DOCTYPE - into a .NET XDocument, as well as saving it, preserving the named entities.

Read in html string with static var doc = HtmlDocument.ParseHtml(html)
Use standard XDocument selects/updates, with the namespace (doc.XHtml or for namespace manager doc.Ns with "html:" prefix, as applicable)
Get the html with doc.WriteHtml(), which automatically handles `&amp;`, `&gt;`, `&lt;`, `&quot;` and `&nbsp;`.
If you want all HTML entities expanded as entities, use static html = HtmlDocument.EntitizeHtml(html)
If it's not well formed HTML, use Html Agility Pack instead

Example use

//read the text
var html = File.ReadAllText(path);

//parse it into an XDocument
var doc = HtmlDocument.ParseHtml(html);

//always use the namespace (in HtmlDocument.XHtml)
//FirstOrDefault (not First) so a page without any <input> does not throw
var input = doc.Descendants(doc.XHtml + "input").FirstOrDefault();
if (input != null)
{
    input.SetAttributeValue("autocomplete", "off"); //change it
}

//use XPath with a namespace manager (html)
var footer = doc.XPathSelectElement("//html:footer", doc.Ns);
if (footer != null)
{
    //\u00A9 is the copyright sign; the fixed-width \u escape avoids the variable-length \x pitfall
    footer.SetValue("\u00A9 MyCompany");
}

//get the updated html string
html = doc.WriteHtml();
//fix other named entities (if required - not needed for &gt;, &lt;, &amp;, &quot; and &nbsp;)
html = HtmlDocument.EntitizeHtml(html);
//save it
File.WriteAllText(path, html);

Code

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Xml;
using System.Xml.Linq;

namespace Library
{
    /// <summary>An HTML page as a XDocument. Use <see cref="ParseHtml"/> to load html, <see cref="WriteHtml"/> to write</summary>
    /// <summary>An HTML page as a XDocument. Use <see cref="ParseHtml"/> to load html, <see cref="WriteHtml"/> to write</summary>
    /// <remarks>
    /// When the input has no DOCTYPE, or the short HTML5 DOCTYPE, <see cref="ParseHtml"/> substitutes the
    /// XHTML 1.0 Transitional DOCTYPE and records what it did in a marker comment placed right after it.
    /// <see cref="WriteHtml"/> reads that marker back to restore the original prolog.
    /// </remarks>
    public class HtmlDocument : XDocument
    {
        //DOCTYPE substituted when the input has none (or only the HTML5 one)
        const string XhtmlDocType =
            @"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 Transitional//EN"" ""http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"">";

        //marker comment text: the input had the HTML5 DOCTYPE
        const string Html5Marker = "HTML5";

        //marker comment text: the input had no DOCTYPE at all
        const string NoDocTypeMarker = "NO DOCTYPE";

        //entity table is read from the embedded resource once, on first use
        static readonly Lazy<Dictionary<int, string>> HtmlEntities =
            new Lazy<Dictionary<int, string>>(LoadHtmlEntities);

        /// <summary>
        /// Initializes a new instance of the <see cref="HtmlDocument"/> class.
        /// </summary>
        /// <param name="other">The <see cref="T:System.Xml.Linq.XDocument" /> object that will be copied.</param>
        public HtmlDocument(XDocument other)
            : base(other)
        {
        }

        /// <summary>
        /// The namespace. Also select elements with this name (eg doc.XHtml + "body")
        /// </summary>
        public XNamespace XHtml { get; set; }

        /// <summary>
        /// Namespace manager used for XPath queries (the namespace is registered with the "html" prefix)
        /// </summary>
        public XmlNamespaceManager Ns { get; internal set; }

        /// <summary>
        /// Convert a string of html into an XDocument. HTML must be well-formed xml, but could have any common Doctype including Html5 (or no DocType)
        /// </summary>
        /// <param name="html">The html text. Must not be null or empty.</param>
        /// <returns>The parsed document, with <see cref="XHtml"/> and <see cref="Ns"/> populated.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="html"/> is null or empty.</exception>
        /// <exception cref="XmlException">The html is not well-formed xml.</exception>
        public static HtmlDocument ParseHtml(string html)
        {
            //SanitizeHtml validates and trims the input
            html = SanitizeHtml(html);
            using (var reader = XmlReader.Create(new StringReader(html), XmlReaderSettings))
            {
                var xDocument = Load(reader, LoadOptions.PreserveWhitespace);
                var doc = new HtmlDocument(xDocument);

                //get the xmlns (maybe absent or not official html); never leave it null
                doc.XHtml = doc.Root != null ? doc.Root.Name.Namespace : XNamespace.None;

                doc.Ns = new XmlNamespaceManager(reader.NameTable ?? new NameTable());
                doc.Ns.AddNamespace("html", doc.XHtml.ToString());

                //InternalSubset is an empty string and should be null (other dtd has "[]" at the end)
                if (doc.DocumentType != null && string.IsNullOrEmpty(doc.DocumentType.InternalSubset))
                {
                    doc.DocumentType.InternalSubset = null;
                }

                return doc;
            }
        }

        /// <summary>
        /// Makes sure the html carries a DOCTYPE, so the named entities can be resolved while parsing.
        /// </summary>
        /// <param name="html">The raw html.</param>
        /// <returns>The trimmed html, with the XHTML 1.0 Transitional DOCTYPE plus a marker comment substituted where needed.</returns>
        static string SanitizeHtml(string html)
        {
            if (string.IsNullOrEmpty(html)) throw new ArgumentNullException("html");
            html = html.Trim();

            //look for a DocType
            var start = html.IndexOf("<!DOCTYPE", StringComparison.OrdinalIgnoreCase);

            //1. There is no DocType - add XHtml 1 Transitional
            if (start == -1)
            {
                //an xml declaration must stay the very first thing in the text
                var insertAt = EndOfXmlDeclaration(html);
                return html.Substring(0, insertAt) + BuildProlog(NoDocTypeMarker) + html.Substring(insertAt);
            }

            var end = html.IndexOf(">", start, StringComparison.Ordinal);
            if (end == -1)
            {
                //unterminated DocType - leave it alone and let the parser report the error
                return html;
            }

            //2. There is an Html5 DocType - swap it for XHtml 1 Transitional
            if (end - start <= "<!DOCTYPE html>".Length)
            {
                //keep whatever came before the DocType (eg the xml declaration)
                return html.Substring(0, start) + BuildProlog(Html5Marker) + html.Substring(end + 1);
            }

            //3. There is an other DocType - assume it's ok.
            return html;
        }

        /// <summary>
        /// The substituted DOCTYPE followed by a marker comment recording why it was substituted.
        /// </summary>
        static string BuildProlog(string marker)
        {
            return XhtmlDocType + "\n<!-- " + marker + " -->\n";
        }

        /// <summary>
        /// Index just past a leading "&lt;?xml ... ?&gt;" processing instruction, or 0 when there is none.
        /// </summary>
        static int EndOfXmlDeclaration(string html)
        {
            if (!html.StartsWith("<?xml", StringComparison.OrdinalIgnoreCase)) return 0;
            var close = html.IndexOf("?>", StringComparison.Ordinal);
            return close == -1 ? 0 : close + 2;
        }

        /// <summary>
        /// Finds the marker comment written by <see cref="SanitizeHtml"/>. Only the nodes before the root element are examined.
        /// </summary>
        /// <returns>The marker comment, or null when the document has none.</returns>
        static XComment FindMarker(XDocument document)
        {
            foreach (var node in document.Nodes())
            {
                if (node is XElement) break; //reached the root - comments further on are page content

                var comment = node as XComment;
                if (comment == null) continue;
                if (IsMarker(comment, Html5Marker) || IsMarker(comment, NoDocTypeMarker))
                {
                    return comment;
                }
            }
            return null;
        }

        static bool IsMarker(XComment comment, string marker)
        {
            return string.Equals(comment.Value.Trim(), marker, StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Writes the HTML. Only xml named entities plus nbsp are explicitly written.
        /// </summary>
        /// <returns>
        /// The html text. When <see cref="ParseHtml"/> substituted the DOCTYPE, the original prolog is restored:
        /// the HTML5 DOCTYPE is written back, or no DOCTYPE at all.
        /// </returns>
        public string WriteHtml()
        {
            string html;

            //does it have our marker comment?
            var marker = FindMarker(this);
            if (marker == null)
            {
                //the DocType came from the input - write the document as it is
                html = ToString();
            }
            else
            {
                var wasHtml5 = IsMarker(marker, Html5Marker);

                //work on a copy so this document keeps its DocType and marker for later calls
                var copy = new XDocument(this);
                var copyMarker = FindMarker(copy);
                if (copy.DocumentType != null) copy.DocumentType.Remove();
                if (copyMarker != null) copyMarker.Remove();

                //removing the prolog nodes can leave whitespace in front of the root
                html = copy.ToString().TrimStart();

                if (wasHtml5)
                {
                    //reinsert the html5 doctype
                    html = "<!DOCTYPE html>\r\n" + html;
                }
                //otherwise the input had no doctype, so none is written
            }

            //for non-breaking space only, show the named entity.
            html = html.Replace("\u00A0", "&nbsp;");
            //for other entities, use EntitizeHtml method

            return html;
        }

        /// <summary>
        /// Replaces resolved characters with the corresponding html named entity
        /// </summary>
        /// <param name="html">The html text. Must not be null or empty.</param>
        /// <returns>The html with every character that has a named entity written as that entity.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="html"/> is null or empty.</exception>
        public static string EntitizeHtml(string html)
        {
            if (string.IsNullOrEmpty(html)) throw new ArgumentNullException("html");

            var entities = HtmlEntities.Value;
            if (entities.Count == 0) return html; //entity resource not found - nothing to replace

            //single pass over the text instead of one full Replace per entity
            var result = new System.Text.StringBuilder(html.Length);
            foreach (var ch in html)
            {
                string name;
                if (entities.TryGetValue(ch, out name))
                {
                    result.Append('&').Append(name).Append(';');
                }
                else
                {
                    result.Append(ch);
                }
            }
            return result.ToString();
        }

        /// <summary>
        /// Reads the entity declarations (lines such as &lt;!ENTITY nbsp "&amp;#160;"&gt;) from the embedded resource.
        /// </summary>
        /// <returns>Entity names keyed by character code; empty when the resource is not found.</returns>
        static Dictionary<int, string> LoadHtmlEntities()
        {
            const int nameStart = 9; //length of "<!ENTITY "

            var entities = new Dictionary<int, string>();
            using (
                var stream =
                    System.Reflection.Assembly.GetExecutingAssembly()
                        .GetManifestResourceStream("Library.ParseXHtml.xhtml-entities.ent"))
            {
                if (stream == null) return entities;
                using (var sr = new StreamReader(stream))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        if (!line.StartsWith("<!ENTITY", StringComparison.OrdinalIgnoreCase)) continue;
                        var q1 = line.IndexOf("\"&#", StringComparison.Ordinal);
                        if (q1 < nameStart) continue;
                        var q2 = line.IndexOf(";\"", q1, StringComparison.Ordinal);
                        if (q2 == -1) continue;

                        //the name is everything between "<!ENTITY " and the opening quote, however it is padded
                        var ent = line.Substring(nameStart, q1 - nameStart).Trim();
                        if (ent.Length == 0) continue;
                        if (ent == "quot" || ent == "gt" || ent == "lt" || ent == "amp" || ent == "apos")
                        {
                            continue; //done automatically by the XmlWriter in ToString
                        }

                        var v = line.Substring(q1 + 3, q2 - q1 - 3);
                        int key;
                        if (!int.TryParse(v, NumberStyles.Integer, CultureInfo.InvariantCulture, out key))
                        {
                            continue; //not a plain decimal character reference
                        }

                        //first declaration wins if a code is declared twice
                        if (!entities.ContainsKey(key))
                        {
                            entities.Add(key, ent);
                        }
                    }
                }
            }
            return entities;
        }

        /// <summary>
        /// Reader settings with DTD processing switched on, so the entities declared by the DOCTYPE are resolved.
        /// </summary>
        static XmlReaderSettings XmlReaderSettings
        {
            get
            {
                //NOTE(review): DTD processing is enabled, so what is fetched for a DOCTYPE in the input
                //depends entirely on HtmlResolver (defined elsewhere) - confirm it never reaches out
                //to arbitrary URIs before parsing untrusted html
                var readerSettings = new XmlReaderSettings
                {
                    DtdProcessing = DtdProcessing.Parse,
                    XmlResolver = new HtmlResolver(),
                    //.net 4 includes the full XHtml 1.0 DTD
                    //XmlResolver = new XmlPreloadedResolver(XmlKnownDtds.Xhtml10),
                };
                return readerSettings;
            }
        }
    }
}

static void