Web site scraper
Utility to scrape web pages and sites by spidering links in HTML.
- Uses LinkQueue to manage the link queue.
- Uses Scraper for core HTML downloading.
- See also Site404Checker, which checks links.
using System;
using System.Diagnostics;
using System.IO;
using System.Net;
namespace Library.Web
{
/// <summary>
/// Utility to scrape web pages and sites by spidering links in HTML
/// </summary>
/// <remarks>
/// Uses <see cref="Library.Web.Scraper"/> for the basic download functions (by composition rather than inheritance, for simplicity). Uses <see cref="Library.Web.LinkQueue"/> to build and manage the link queue.
/// </remarks>
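/// <example>
/// A minimal usage sketch (the uris and paths here are illustrative):
/// <code>
/// SiteScraper siteScraper = new SiteScraper();
/// siteScraper.Timeout = 10000; //milliseconds
/// //download a single page plus the images it links to
/// siteScraper.DownloadPageAndImages(new Uri("http://example.com/index.html"), @"c:\temp\site\index.html");
/// //or spider the whole site from its root page
/// siteScraper.DownloadWebsite(new Uri("http://example.com/"), @"c:\temp\site\index.html");
/// </code>
/// </example>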
public class SiteScraper
{
protected Uri _startDownloadUri;
protected Uri _startWebUri;
protected int _timeout = 5000;
/// <summary>
/// Manages the queue. TODO: extract an interface and use a strategy pattern for different parsing options.
/// </summary>
protected LinkQueue linkQueue = new LinkQueue();
public int Timeout
{
get { return _timeout; }
set { _timeout = value; }
}
/// <summary>
/// Fires when a file is about to be downloaded. Set <see cref="DownloadEventArgs.Cancel"/> to true to cancel the download.
/// </summary>
/// <example><code>
/// siteScraper.OnDownloadFile += delegate(object sender, SiteScraper.DownloadEventArgs e)
/// {
/// if (e.DownloadUri.AbsolutePath.Contains("unwantedsection")) e.Cancel = true;
/// };
/// </code></example>
public event EventHandler<DownloadEventArgs> OnDownloadFile;
/// <summary>
/// Download an html webpage and linked images that are on the same base address
/// </summary>
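/// <example>
/// A sketch; the file name must be an absolute path because it is parsed as a uri:
/// <code>
/// bool ok = siteScraper.DownloadPageAndImages(new Uri("http://example.com/page.html"), @"c:\temp\page.html");
/// if (!ok) Debug.WriteLine("Download failed or was cancelled");
/// </code>
/// </example>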
public bool DownloadPageAndImages(Uri uri, string fileName)
{
if (uri == null) throw new ArgumentNullException("uri");
_startDownloadUri = new Uri(fileName); //fileName must be an absolute path so it parses as a uri
_startWebUri = uri;
linkQueue.Clear();
if (!RaiseDownloadEvent(uri)) return false;
WebRequest request = WebRequest.Create(uri);
request.Timeout = Timeout;
string html;
try
{
using (WebResponse resp = request.GetResponse())
{
html = Download(resp, fileName);
if (string.IsNullOrEmpty(html))
return true; //non-text content (e.g. an image) has already been saved; nothing to parse
}
}
catch (WebException we) //timeout or page not available
{
Debug.WriteLine(we.Message);
return false;
}
linkQueue.BuildNonPageLinkQueue(uri, html);
DownloadAllLinks();
return true;
}
/// <summary>
/// Download an html webpage and linked resources that are on the same base address
/// </summary>
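/// <example>
/// A sketch; linked pages and resources are saved relative to the directory of the file name:
/// <code>
/// siteScraper.DownloadWebsite(new Uri("http://example.com/"), @"c:\temp\site\index.html");
/// </code>
/// </example>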
public void DownloadWebsite(Uri uri, string fileName)
{
if (uri == null) throw new ArgumentNullException("uri");
_startWebUri = uri;
linkQueue.Clear();
_startDownloadUri = new Uri(fileName); //fileName must be an absolute path so it parses as a uri
linkQueue.Add(uri); //seed the queue; adding also marks the uri so it's not selected again
Uri webUri;
while ((webUri = linkQueue.DequeueToUri()) != null)
{
string downloadPath = DownloadFilePath(webUri);
Debug.WriteLine("DownloadWebsite " + webUri.AbsoluteUri + " " + downloadPath);
DownloadFromQueue(webUri, downloadPath);
}
}
/// <summary>
/// Download a uri to the fileName
/// </summary>
private void DownloadFromQueue(Uri uri, string fileName)
{
if (!RaiseDownloadEvent(uri)) return;
WebRequest request = WebRequest.Create(uri);
request.Timeout = Timeout;
string html = string.Empty;
try
{
using (WebResponse resp = request.GetResponse())
{
html = Download(resp, fileName);
if (string.IsNullOrEmpty(html))
return; //non-text content (e.g. an image) has already been saved; nothing to parse
}
}
catch (WebException we) //timeout or page not available
{
Debug.WriteLine(we.Message);
return; //nothing was downloaded, so there is nothing to parse
}
linkQueue.BuildLinkQueue(uri, html);
}
/// <summary>
/// Download from webresponse to filename
/// </summary>
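/// <example>
/// A sketch of a one-off download, mirroring how the class calls it internally:
/// <code>
/// WebRequest request = WebRequest.Create(new Uri("http://example.com/logo.gif"));
/// using (WebResponse resp = request.GetResponse())
/// {
///     string html = siteScraper.Download(resp, @"c:\temp\logo.gif"); //returns null for non-text content
/// }
/// </code>
/// </example>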
public string Download(WebResponse resp, string fileName)
{
Scraper scraper = new Scraper();
scraper.CheckWebResponse(resp);
Debug.WriteLine("Download " + scraper.ResponseUri.AbsolutePath + " to " + fileName);
Scraper.CheckDirectory(fileName);
if (Scraper.IsText(scraper.ContentType))
{
using (Stream stream = resp.GetResponseStream())
{
using (StreamReader sr = new StreamReader(stream))
{
string html = sr.ReadToEnd(); //get the html text before we save it so we can parse
File.WriteAllText(fileName, html);
return html;
}
}
}
//not text content, just save the raw bytes
using (FileStream writeStream = new FileStream(fileName, FileMode.Create, FileAccess.Write))
{
using (Stream respStream = resp.GetResponseStream())
Scraper.CopyStream(respStream, writeStream);
}
return null;
}
/// <summary>
/// Download all links using <see cref="Scraper"/>
/// </summary>
private void DownloadAllLinks()
{
Scraper scraper = new Scraper();
foreach (string link in linkQueue.Links)
{
//the queue stores links as absolute uris, so no rebasing is needed here
Uri webUri = new Uri(link);
if (!RaiseDownloadEvent(webUri)) continue;
string downloadPath = DownloadFilePath(webUri);
Debug.WriteLine("ReadAllImages " + link + " " + downloadPath);
scraper.DownloadFile(webUri, downloadPath);
}
}
/// <summary>
/// Gets the actual download path: makes the web item (itemUri) relative to the initial
/// web location (_startWebUri), then applies that relative path to the download root (_startDownloadUri)
/// </summary>
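/// <example>
/// For example, with _startWebUri = http://example.com/ and _startDownloadUri = c:\temp\site\index.html,
/// an itemUri of http://example.com/img/logo.gif maps to c:\temp\site\img\logo.gif.
/// </example>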
private string DownloadFilePath(Uri itemUri)
{
Uri rel = _startWebUri.MakeRelativeUri(itemUri); //get a relative uri
Uri imgFile = new Uri(_startDownloadUri, rel); //apply it to the download location
return imgFile.LocalPath;
}
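/// <summary>
/// Raises <see cref="OnDownloadFile"/>; returns false if a subscriber cancelled the download
/// </summary>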
private bool RaiseDownloadEvent(Uri uri)
{
EventHandler<DownloadEventArgs> eventHandler = OnDownloadFile;
if (eventHandler != null)
{
DownloadEventArgs args = new DownloadEventArgs(uri);
eventHandler(this, args);
if (args.Cancel) return false;
}
return true;
}
#region Nested type: DownloadEventArgs
/// <summary>
/// A cancelable EventArgs. Could equally have derived from System.ComponentModel.CancelEventArgs.
/// </summary>
public class DownloadEventArgs : EventArgs
{
private readonly Uri _downloadUri;
private bool _cancel;
public DownloadEventArgs(Uri uri)
{
_downloadUri = uri;
}
public bool Cancel
{
get { return _cancel; }
set { _cancel = value; }
}
public Uri DownloadUri
{
get { return _downloadUri; }
}
}
#endregion
}
}