Web site link checker
- See also SiteScraper, which downloads/spiders whole websites
- Uses LinkQueue to manage the queue of links still to be checked
- Uses Scraper for HTTP requests and HTML parsing (a sketch of the assumed helper interfaces follows)
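LinkQueue and Scraper live elsewhere in this library; the stubs below are only a sketch of the surface Site404Checker relies on, with member names and signatures inferred from the calls in the code and the expected behaviour hedged in comments.

//Sketch only: signatures inferred from Site404Checker's usage below;
//the real implementations live elsewhere in Library.Web.
using System;
namespace Library.Web
{
    public class LinkQueue
    {
        public bool CheckSameHost { get; set; }             //presumably: if true, only enqueue links on the start host
        public bool CheckSameOrChildDirectory { get; set; } //presumably: if true, restrict same-host links to the start directory tree
        public void Add(Uri uri) { throw new NotImplementedException(); }
        public Uri DequeueToUri() { throw new NotImplementedException(); } //next unchecked link, or null when the queue is empty
        public void BuildLinkQueue(Uri pageUri, string html) { throw new NotImplementedException(); } //parse the html and enqueue its links
        public static bool IsSameHost(Uri a, Uri b) { throw new NotImplementedException(); }
    }

    public class Scraper
    {
        public string ContentType { get; private set; } //content type of the last response
        public bool IsThere(Uri uri) { throw new NotImplementedException(); } //HEAD request: does the resource exist?
        public string DownloadString(Uri uri) { throw new NotImplementedException(); }
        public static bool IsText(string contentType) { throw new NotImplementedException(); }
    }
}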
using System;
using System.Collections.Generic;

namespace Library.Web
{
    public static class Site404Checker
    {
        /// <summary>
        /// Read a website and check all links. This will be slow for large sites!
        /// </summary>
        /// <returns>The absolute URIs of all broken links found.</returns>
        public static List<string> CheckPage(Uri uri)
        {
            if (uri == null) throw new ArgumentNullException("uri");

            List<string> brokenLinks = new List<string>();
            LinkQueue linkQueue = new LinkQueue();
            linkQueue.Add(uri);
            linkQueue.CheckSameHost = false; //check external links too (they are not spidered)
            linkQueue.CheckSameOrChildDirectory = true; //but only spider a subsection of our site

            Scraper scraper = new Scraper();
            Uri webUri;
            while ((webUri = linkQueue.DequeueToUri()) != null)
            {
                if (!scraper.IsThere(webUri)) //HEAD request, so should be fast - note: may need a proxy
                {
                    brokenLinks.Add(webUri.AbsoluteUri);
                    continue;
                }
                if (!LinkQueue.IsSameHost(uri, webUri)) continue; //don't follow external links
                if (!Scraper.IsText(scraper.ContentType)) continue; //only parse text content
                //it's html, so download it and queue any links it contains
                string html = scraper.DownloadString(webUri);
                linkQueue.BuildLinkQueue(webUri, html);
            }
            return brokenLinks;
        }
    }
}
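A minimal usage sketch; the start URL is a placeholder, and CheckPage returns the absolute URIs of the links whose HEAD check failed.

//Example usage - a sketch; the URL below is hypothetical
using System;
using Library.Web;

class Program
{
    static void Main()
    {
        Uri start = new Uri("http://example.com/docs/"); //hypothetical start page
        foreach (string link in Site404Checker.CheckPage(start))
        {
            Console.WriteLine("Broken: " + link);
        }
    }
}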