static void

Web scraping

A utility class for web scraping. WebClient.DownloadString and WebClient.DownloadFile are easier, but you often need to set a timeout on the web requests, so you have to use WebRequest instead. It also includes regexes to pull out images, links etc. See also the website scraper that uses this class to loop through a page or a section of a website.

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
 
namespace Library.Web
{
    /// <summary>
    /// Utility methods to screen-scrape webpages using WebRequest
    /// </summary>
    /// <remarks>
    /// For spidering whole pages and sites, see <see cref="Library.Web.SiteScraper"/> which uses this internally.
    /// </remarks>
    public class Scraper
    {
        public Scraper()
        {
            Timeout = 5000;
            //use default credentials if there's a proxy
            WebRequest.DefaultWebProxy.Credentials = CredentialCache.DefaultNetworkCredentials;
        }
 
        #region Properties
        private int _timeout;
        /// <summary>
        /// Request timeout in milliseconds (defaults to 5000, set in the constructor).
        /// </summary>
        public int Timeout
        {
            get { return _timeout; }
            set { _timeout = value; }
        }
 
        private string _contentType;
        /// <summary>
        /// Content-Type of the last response. octet-stream for a .html path is
        /// rewritten to text/html in <see cref="CheckWebResponse"/>.
        /// </summary>
        public string ContentType
        {
            get { return _contentType; }
        }
        private long _contentLength;
        /// <summary>
        /// Content-Length of the last response.
        /// </summary>
        public long ContentLength
        {
            get { return _contentLength; }
        }
        private DateTime _lastModified;
        /// <summary>
        /// Last-Modified of the last response. Only populated for http responses.
        /// </summary>
        public DateTime LastModified
        {
            get { return _lastModified; }
        }
        private HttpStatusCode _statusCode;
        /// <summary>
        /// Status of the last response. Non-http (file) responses report OK when
        /// there is content, otherwise Unused (see <see cref="CheckWebResponse"/>).
        /// </summary>
        public HttpStatusCode StatusCode
        {
            get { return _statusCode; }
        }
        private Uri _responseUri;
        /// <summary>
        /// The uri of the last response (WebResponse.ResponseUri).
        /// </summary>
        public Uri ResponseUri
        {
            get { return _responseUri; }
        }
        #endregion
 
        /// <summary>
        /// Update our properties from the WebResponse
        /// </summary>
        public void CheckWebResponse(WebResponse resp)
        {
            _responseUri = resp.ResponseUri;
            _contentType = resp.ContentType;
            //html loaded from disc has octet-stream so we reset it
            if (_contentType.Equals("application/octet-stream") && _responseUri.AbsolutePath.EndsWith(".html"))
                _contentType = "text/html";
            _contentLength = resp.ContentLength;
            //files don't return HttpStatusCode
            _statusCode = HttpStatusCode.Unused;
            if (ContentLength > 0) _statusCode = HttpStatusCode.OK;
            HttpWebResponse httpResp = resp as HttpWebResponse;
            if (httpResp != null)
            {
                _statusCode = httpResp.StatusCode;
                _lastModified = httpResp.LastModified;
            }
        }
 
        /// <summary>
        /// Download a text page (eg html). Equivalent to WebClient.DownloadString but uses WebRequest so has a timeout.
        /// </summary>
        public string DownloadString(Uri uri)
        {
            //we could use WebClient.DownloadString but that has no timeout control
            if (uri == null) throw new ArgumentNullException("uri");
            WebRequest request = WebRequest.Create(uri);
            request.Timeout = Timeout;
            try
            {
                using (WebResponse resp = request.GetResponse())
                    return DownloadString(resp);
            }
            catch (WebException we)//timeout or page not available
            {
                Debug.WriteLine(uri.AbsolutePath + " " + we.Message);
                return null;
            }
        }
 
        /// <summary>
        /// Download a text page (eg html). Equivalent to WebClient.DownloadString. Overload for an open WebResponse- has no internal error trap.
        /// </summary>
        public string DownloadString(WebResponse resp)
        {
            CheckWebResponse(resp);
            if (IsText(ContentType)) //don't read images/pdfs
            {
                Debug.WriteLine("DownloadString " + ResponseUri.AbsolutePath);
                using (Stream stream = resp.GetResponseStream())
                {
                    using (StreamReader sr = new StreamReader(stream))
                        return sr.ReadToEnd();
                }
            }
            return null;
        }
 
        /// <summary>
        /// Download any file (html/image/pdf etc). Equivalent to WebClient.DownloadFile but uses WebRequest so has a timeout.
        /// </summary>
        public bool DownloadFile(Uri uri, string fileName)
        {
            if (uri == null) throw new ArgumentNullException("uri");
            WebRequest request = WebRequest.Create(uri);
            request.Timeout = Timeout;
            try
            {
                using (WebResponse resp = request.GetResponse())
                    DownloadFile(resp, fileName);
                return true;
            }
            catch (WebException we)//timeout or page not available
            {
                Debug.WriteLine(uri.AbsolutePath + " " + we.Message);
                return false;
            }
        }
 
        /// <summary>
        /// Download any file (html/image/pdf etc). Equivalent to WebClient.DownloadFile. Overload for an open WebResponse- has no internal error trap
        /// </summary>
        public void DownloadFile(WebResponse resp, string fileName)
        {
            CheckWebResponse(resp);
            Debug.WriteLine("DownloadFile " + ResponseUri.AbsolutePath + " to " + fileName);
            CheckDirectory(fileName);
            using (FileStream writeStream = new FileStream(fileName, FileMode.Create, FileAccess.Write))
            {
                using (Stream respStream = resp.GetResponseStream())
                    CopyStream(respStream, writeStream);
            }
        }
 
        /// <summary>
        /// Does an HTTP HEAD request and returns true if exists.
        /// </summary>
        public bool IsThere(Uri uri)
        {
            if (uri == null) throw new ArgumentException("uri");
            //we could use WebClient.DownloadString but that has no timeout control
            WebRequest request = WebRequest.Create(uri);
            request.Method = "HEAD";
            if (Proxy != null) request.Proxy = Proxy;
            request.Timeout = Timeout;
            try
            {
                using (WebResponse resp = request.GetResponse())
                {
                    CheckWebResponse(resp);
                    if (StatusCode == HttpStatusCode.OK) return true;
                }
            }
            catch (WebException)//timeout or page not available
            {
                return false;
            }
            return false;
        }
 
        private IWebProxy _proxy;
        /// <summary>
        /// Optional proxy. Applied to the HEAD request in <see cref="IsThere"/>.
        /// NOTE(review): not applied in DownloadString/DownloadFile - confirm whether that is intentional.
        /// </summary>
        public IWebProxy Proxy
        {
            get { return _proxy; }
            set { _proxy = value; }
        }
 
        #region Utilities
        internal static void CheckDirectory(string fileName)
        {
            string dir = Path.GetDirectoryName(fileName);
            if (!Directory.Exists(dir)) Directory.CreateDirectory(dir);
        }
 
        /// <summary>
        /// Simple copy stream utility with 4k buffer.
        /// </summary>
        public static void CopyStream(Stream input, Stream output)
        {
            CopyStream(input, output, 4096);
        }
        /// <summary>
        /// Simple copy stream utility with explicit buffer size
        /// </summary>
        public static void CopyStream(Stream input, Stream output, int bufferSize)
        {
            byte[] bytes = new byte[bufferSize];
            int numBytes;
            while ((numBytes = input.Read(bytes, 0, bufferSize)) > 0)
                output.Write(bytes, 0, numBytes);
        }
 
        /// <summary>
        /// Determines whether the specified MIME type is text.
        /// </summary>
        public static bool IsText(string mimeType)
        {
            return mimeType.Equals("text/html", StringComparison.OrdinalIgnoreCase)
                || mimeType.Equals("text/plain", StringComparison.OrdinalIgnoreCase)
                || mimeType.Equals("text/css", StringComparison.OrdinalIgnoreCase)
                || mimeType.Equals("text/javascript", StringComparison.OrdinalIgnoreCase)
                || mimeType.Equals("application/x-javascript", StringComparison.OrdinalIgnoreCase);
        }
        /// <summary>
        /// Determines whether the specified file extension is an image (or PDF)
        /// </summary>
        public static bool IsImage(string absolutePath)
        {
            return absolutePath.EndsWith(".jpg", StringComparison.OrdinalIgnoreCase)
                || absolutePath.EndsWith(".jpeg", StringComparison.OrdinalIgnoreCase)
                || absolutePath.EndsWith(".gif", StringComparison.OrdinalIgnoreCase)
                || absolutePath.EndsWith(".png", StringComparison.OrdinalIgnoreCase)
                || absolutePath.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);
        }
        #endregion
 
        #region Static Regex Matches
        /// <summary>
        /// Parses (with regex) an html string for &lt;img&gt; tags and returns a list of the uris (adjusts relative uris to the uri input parameter)
        /// </summary>
        public static List<Uri> GetImageUris(Uri uriBase, string html)
        {
            const string imgs =
                "<img[^>]+src\\s*=\\s*(?:\"(?<src>[^\"]*)\"|'(?<src>[^']*)'|(?<src>[^\"'>\\s]+))[^>]*>";
            return GetMatches(uriBase, html, imgs, "src");
        }
 
        /// <summary>
        /// Parses (with regex) an html string for &lt;a href&gt; tags and returns a list of the uris (adjusts relative uris to the uri input parameter)
        /// </summary>
        public static List<Uri> GetAnchorUris(Uri uriBase, string html)
        {
            const string links =
                "<a[^>]+href\\s*=\\s*(?:\"(?<href>[^\"]*)\"|'(?<href>[^']*)'|(?<href>[^\"'>\\s]+))[^>]*>";
            return GetMatches(uriBase, html, links, "href");
        }
 
        public static List<Uri> GetCssUris(Uri uriBase, string html)
        {
            const string links =
                "<link[^>]+href\\s*=\\s*(?:\"(?<href>[^\"]*)\"|'(?<href>[^']*)'|(?<href>[^\"'>\\s]+))[^>]*>";
            return GetMatches(uriBase, html, links, "href");
        }
 
        public static List<Uri> GetEmbedUris(Uri uriBase, string html)
        {
            const string embeds =
                "<embed[^>]+src\\s*=\\s*(?:\"(?<src>[^\"]*)\"|'(?<src>[^']*)'|(?<src>[^\"'>\\s]+))[^>]*>";
            return GetMatches(uriBase, html, embeds, "src");
        }
 
        /// <summary>
        /// Parses an html string using a regex pattern and group name and returns a list of the uris (adjusts relative uris to the uri input parameter)
        /// </summary>
        public static List<Uri> GetMatches(Uri uriBase, string html, string regexPattern, string groupName)
        {
            List<Uri> result = new List<Uri>();
            Regex finder = new Regex(regexPattern, RegexOptions.IgnoreCase);
            foreach (Match m in finder.Matches(html))
            {
                string g = m.Groups[groupName].Value;
                Uri newUri = new Uri(uriBase, g);
                if (!result.Contains(newUri)) result.Add(newUri);
            }
            return result;
        }
        #endregion
    }
}