// Web site scraper link queue
// - Used by SiteScraper, which downloads/spiders websites
// - Used by Site404Checker, which checks links for 404s
// - Uses Scraper for html parsing
using System;
using System.Collections.Generic;
namespace Library.Web
{
    /// <summary>
    /// Manages a queue of links within a website. Use <see cref="BuildLinkQueue"/> to scan html (from a Uri) and add the links it finds to the queue (duplicates are skipped). Use <see cref="Dequeue"/> to remove them from the queue.
    /// </summary>
    /// <remarks>
    /// Uses <see cref="Library.Web.Scraper"/> for static functions to parse the html with regexes.
    /// </remarks>
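    /// <example>
    /// A minimal crawl loop sketch (FetchHtml is a hypothetical downloader; substitute your own, e.g. WebClient.DownloadString):
    /// <code>
    /// LinkQueue queue = new LinkQueue();
    /// queue.Add(new Uri("http://example.com/"));
    /// Uri uri;
    /// while ((uri = queue.DequeueToUri()) != null)
    /// {
    ///     string html = FetchHtml(uri); //hypothetical download step
    ///     queue.BuildLinkQueue(uri, html);
    /// }
    /// </code>
    /// </example>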
    public class LinkQueue
    {
        /// <summary>
        /// All links seen so far; used for duplicate checking.
        /// </summary>
        protected readonly List<string> _links = new List<string>();
        /// <summary>
        /// The queue of pending links. Remove items with <see cref="Dequeue"/>.
        /// </summary>
        protected readonly Queue<string> _queue = new Queue<string>();
        /// <summary>
        /// By default, links are restricted to the same host and to the same or a child directory.
        /// </summary>
        public LinkQueue()
        {
            CheckSameHost = true;
            CheckSameOrChildDirectory = true;
        }
        public void Clear()
        {
            _links.Clear();
            _queue.Clear();
        }
        /// <summary>
        /// Manually add an item to the queue. Use for the first webpage; afterwards scan the html with <see cref="BuildLinkQueue"/>.
        /// </summary>
        public void Add(Uri uri)
        {
            _links.Add(uri.AbsoluteUri);
            _queue.Enqueue(uri.AbsoluteUri);
        }
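        /// <summary>
        /// Manually add an absolute uri string to the queue. Note that neither Add overload checks for duplicates.
        /// </summary>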
        public void Add(string uri)
        {
            _links.Add(uri);
            _queue.Enqueue(uri);
        }
        /// <summary>
        /// Removes and returns the string at the beginning of the queue, or null if the queue is empty.
        /// </summary>
        public string Dequeue()
        {
            if (_queue.Count == 0) return null; //the queue is empty
            return _queue.Dequeue();
        }
        /// <summary>
        /// Removes and returns the Uri at the beginning of the queue, or null if the queue is empty.
        /// </summary>
        public Uri DequeueToUri()
        {
            string address = Dequeue();
            if (string.IsNullOrEmpty(address)) return null;
            return new Uri(address);
        }
        /// <summary>
        /// Use this to iterate over all the links seen so far (whatever the state of the queue).
        /// </summary>
        public IEnumerable<string> Links
        {
            get { return _links; }
        }
        /// <summary>
        /// Scans the html for all the anchor, image, css and embed links and queues them (TODO: add js if required).
        /// </summary>
        public void BuildLinkQueue(Uri baseuri, string html)
        {
            BuildLinkQueueFromList(baseuri, Scraper.GetAnchorUris(baseuri, html));
            BuildLinkQueueFromList(baseuri, Scraper.GetImageUris(baseuri, html));
            BuildLinkQueueFromList(baseuri, Scraper.GetCssUris(baseuri, html));
            BuildLinkQueueFromList(baseuri, Scraper.GetEmbedUris(baseuri, html));
        }
        /// <summary>
        /// Queues only non-page resources: anchors that point to images, plus all images, css and embeds. Duplicates are excluded.
        /// </summary>
        public void BuildNonPageLinkQueue(Uri baseuri, string html)
        {
            List<Uri> anchorUris = Scraper.GetAnchorUris(baseuri, html);
            foreach (Uri u in anchorUris)
            {
                if (!Scraper.IsImage(u.AbsolutePath)) continue; //only queue anchors that link to images
                if (!CheckAllowed(baseuri, u)) continue;
                AddToQueue(u);
            }
            BuildLinkQueueFromList(baseuri, Scraper.GetImageUris(baseuri, html));
            BuildLinkQueueFromList(baseuri, Scraper.GetCssUris(baseuri, html));
            BuildLinkQueueFromList(baseuri, Scraper.GetEmbedUris(baseuri, html));
        }
        /// <summary>
        /// Adds each uri to the queue if it is on the same host/in the same or a child directory (as configured).
        /// </summary>
        private void BuildLinkQueueFromList(Uri baseuri, IEnumerable<Uri> list)
        {
            foreach (Uri u in list)
            {
                if (!CheckAllowed(baseuri, u)) continue;
                AddToQueue(u);
            }
        }
        private void AddToQueue(Uri u)
        {
            string absoluteUri = u.AbsoluteUri;
            if (!_links.Contains(absoluteUri)) //skip duplicates
            {
                _links.Add(absoluteUri);
                _queue.Enqueue(absoluteUri);
            }
        }
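        // Note: List<string>.Contains is a linear scan, so the duplicate check is O(n) per link;
        // for very large sites a HashSet<string> would make it O(1) (an optimisation, not done here).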
        /// <summary>
        /// Applies the <see cref="CheckSameHost"/> and <see cref="CheckSameOrChildDirectory"/> restrictions.
        /// </summary>
        private bool CheckAllowed(Uri baseuri, Uri u)
        {
            if (!IsSameHost(baseuri, u))
            {
                if (CheckSameHost) return false;
            }
            else //same host, so check same or child directory
            {
                if (CheckSameOrChildDirectory && !IsSameOrChildDirectory(baseuri, u)) return false;
            }
            return true;
        }
        public static bool IsSameHost(Uri baseuri, Uri u)
        {
            return u.Host.Equals(baseuri.Host, StringComparison.OrdinalIgnoreCase);
        }
        public static bool IsSameOrChildDirectory(Uri baseuri, Uri u)
        {
            return baseuri.IsBaseOf(u); //Uri.IsBaseOf treats the base as everything up to the last slash
        }
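        // Illustrative behaviour of the default filters with baseuri = http://example.com/docs/index.html:
        //   http://example.com/docs/sub/a.html -> same host, child directory: allowed
        //   http://example.com/other/b.html    -> same host but not same/child directory: rejected by CheckSameOrChildDirectory
        //   http://cdn.example.net/style.css   -> different host: rejected by CheckSameHost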
        /// <summary>
        /// If true (the default), only links on the same host as the base uri are queued.
        /// </summary>
        public bool CheckSameHost { get; set; }
        /// <summary>
        /// If true (the default), same-host links are queued only if they are in the same or a child directory of the base uri.
        /// </summary>
        public bool CheckSameOrChildDirectory { get; set; }
    }
}