Web site scraper
Utility to scrape web pages and sites by spidering links in HTML.
- Uses LinkQueue to manage the link queue.
- Uses Scraper for core HTML downloading.
- See also Site404Checker, which checks links.
using System;
using System.Diagnostics;
using System.IO;
using System.Net;
namespace Library.Web
{
/// <summary>
/// Utility to scrape web pages and sites by spidering links in HTML
/// </summary>
/// <remarks>
/// Uses <see cref="Library.Web.Scraper"/> for the basic download functions (by composition rather than inheritance, for simplicity). Uses <see cref="Library.Web.LinkQueue"/> to build and manage the link queue.
/// </remarks>
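/// <example>
/// A minimal usage sketch (the uris and paths here are illustrative):
/// <code>
/// SiteScraper siteScraper = new SiteScraper();
/// siteScraper.Timeout = 10000; //milliseconds
/// //download a single page plus the images it links to
/// siteScraper.DownloadPageAndImages(new Uri("http://example.com/index.html"), @"c:\temp\site\index.html");
/// //or spider the whole site from its root page
/// siteScraper.DownloadWebsite(new Uri("http://example.com/"), @"c:\temp\site\index.html");
/// </code>
/// </example>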
public class SiteScraper
{
protected Uri _startDownloadUri;
protected Uri _startWebUri;
protected int _timeout = 5000;
/// <summary>
/// Manages the queue. TODO: extract an interface and use a strategy pattern for different parsing options.
/// </summary>
protected LinkQueue linkQueue = new LinkQueue();
public int Timeout
{
get { return _timeout; }
set { _timeout = value; }
}
/// <summary>
/// Fires when a file is about to be downloaded. Set <see cref="DownloadEventArgs.Cancel"/> to true to cancel the download.
/// </summary>
/// <example><code>
/// siteScraper.OnDownloadFile += delegate(object sender, SiteScraper.DownloadEventArgs e)
/// {
/// if (e.DownloadUri.AbsolutePath.Contains("unwantedsection")) e.Cancel = true;
/// };
/// </code></example>
public event EventHandler<DownloadEventArgs> OnDownloadFile;
/// <summary>
/// Download an html webpage and linked images that are on the same base address
/// </summary>
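/// <example>
/// A sketch; the file name must be an absolute path because it is parsed as a uri:
/// <code>
/// bool ok = siteScraper.DownloadPageAndImages(new Uri("http://example.com/page.html"), @"c:\temp\page.html");
/// if (!ok) Debug.WriteLine("Download failed or was cancelled");
/// </code>
/// </example>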
public bool DownloadPageAndImages(Uri uri, string fileName)
{
if (uri == null) throw new ArgumentNullException("uri");
_startDownloadUri = new Uri(fileName); //fileName must be an absolute path so it parses as a uri
_startWebUri = uri;
linkQueue.Clear();
if (!RaiseDownloadEvent(uri)) return false;
WebRequest request = WebRequest.Create(uri);
request.Timeout = Timeout;
string html;
try
{
using (WebResponse resp = request.GetResponse())
{
html = Download(resp, fileName);
if (string.IsNullOrEmpty(html))
return true; //non-text content (e.g. an image) has already been saved; nothing to parse
}
}
catch (WebException we) //timeout or page not available
{
Debug.WriteLine(we.Message);
return false;
}
linkQueue.BuildNonPageLinkQueue(uri, html);
DownloadAllLinks();
return true;
}
/// <summary>
/// Download an html webpage and linked resources that are on the same base address
/// </summary>
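/// <example>
/// A sketch; linked pages and resources are saved relative to the directory of the file name:
/// <code>
/// siteScraper.DownloadWebsite(new Uri("http://example.com/"), @"c:\temp\site\index.html");
/// </code>
/// </example>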
public void DownloadWebsite(Uri uri, string fileName)
{
if (uri == null) throw new ArgumentNullException("uri");
_startWebUri = uri;
linkQueue.Clear();
_startDownloadUri = new Uri(fileName); //fileName must be an absolute path so it parses as a uri
linkQueue.Add(uri); //seed the queue; adding also marks the uri so it's not selected again
Uri webUri;
while ((webUri = linkQueue.DequeueToUri()) != null)
{
string downloadPath = DownloadFilePath(webUri);
Debug.WriteLine("DownloadWebsite " + webUri.AbsoluteUri + " " + downloadPath);
DownloadFromQueue(webUri, downloadPath);
}
}
/// <summary>
/// Download a uri to the fileName
/// </summary>
private void DownloadFromQueue(Uri uri, string fileName)
{
if (!RaiseDownloadEvent(uri)) return;
WebRequest request = WebRequest.Create(uri);
request.Timeout = Timeout;
string html = string.Empty;
try
{
using (WebResponse resp = request.GetResponse())
{
html = Download(resp, fileName);
if (string.IsNullOrEmpty(html))
return; //non-text content (e.g. an image) has already been saved; nothing to parse
}
}
catch (WebException we) //timeout or page not available
{
Debug.WriteLine(we.Message);
return; //nothing was downloaded, so there is nothing to parse
}
linkQueue.BuildLinkQueue(uri, html);
}
/// <summary>
/// Download from webresponse to filename
/// </summary>
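/// <example>
/// A sketch of a one-off download, mirroring how the class calls it internally:
/// <code>
/// WebRequest request = WebRequest.Create(new Uri("http://example.com/logo.gif"));
/// using (WebResponse resp = request.GetResponse())
/// {
///     string html = siteScraper.Download(resp, @"c:\temp\logo.gif"); //returns null for non-text content
/// }
/// </code>
/// </example>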
public string Download(WebResponse resp, string fileName)
{
Scraper scraper = new Scraper();
scraper.CheckWebResponse(resp);
Debug.WriteLine("Download " + scraper.ResponseUri.AbsolutePath + " to " + fileName);
Scraper.CheckDirectory(fileName);
if (Scraper.IsText(scraper.ContentType))
{
using (Stream stream = resp.GetResponseStream())
{
using (StreamReader sr = new StreamReader(stream))
{
string html = sr.ReadToEnd(); //get the html text before we save it so we can parse
File.WriteAllText(fileName, html);
return html;
}
}
}
//not text content, just save the raw bytes
using (FileStream writeStream = new FileStream(fileName, FileMode.Create, FileAccess.Write))
{
using (Stream respStream = resp.GetResponseStream())
Scraper.CopyStream(respStream, writeStream);
}
return null;
}
/// <summary>
/// Download all links using <see cref="Scraper"/>
/// </summary>
private void DownloadAllLinks()
{
Scraper scraper = new Scraper();
foreach (string link in linkQueue.Links)
{
//the queue stores links as absolute uris, so no rebasing is needed here
Uri webUri = new Uri(link);
if (!RaiseDownloadEvent(webUri)) continue;
string downloadPath = DownloadFilePath(webUri);
Debug.WriteLine("ReadAllImages " + link + " " + downloadPath);
scraper.DownloadFile(webUri, downloadPath);
}
}
/// <summary>
/// Gets the actual download path: makes the web item (itemUri) relative to the initial
/// web location (_startWebUri), then applies that relative path to the download root (_startDownloadUri)
/// </summary>
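/// <example>
/// For example, with _startWebUri = http://example.com/ and _startDownloadUri = c:\temp\site\index.html,
/// an itemUri of http://example.com/img/logo.gif maps to c:\temp\site\img\logo.gif.
/// </example>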
private string DownloadFilePath(Uri itemUri)
{
Uri rel = _startWebUri.MakeRelativeUri(itemUri); //get a relative uri
Uri imgFile = new Uri(_startDownloadUri, rel); //apply it to the download location
return imgFile.LocalPath;
}
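/// <summary>
/// Raises <see cref="OnDownloadFile"/>; returns false if a subscriber cancelled the download
/// </summary>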
private bool RaiseDownloadEvent(Uri uri)
{
EventHandler<DownloadEventArgs> eventHandler = OnDownloadFile;
if (eventHandler != null)
{
DownloadEventArgs args = new DownloadEventArgs(uri);
eventHandler(this, args);
if (args.Cancel) return false;
}
return true;
}
#region Nested type: DownloadEventArgs
/// <summary>
/// A cancelable EventArgs. Could equally have derived from System.ComponentModel.CancelEventArgs.
/// </summary>
public class DownloadEventArgs : EventArgs
{
private readonly Uri _downloadUri;
private bool _cancel;
public DownloadEventArgs(Uri uri)
{
_downloadUri = uri;
}
public bool Cancel
{
get { return _cancel; }
set { _cancel = value; }
}
public Uri DownloadUri
{
get { return _downloadUri; }
}
}
#endregion
}
}