
This is a method I use when screen scraping to retrieve all hyperlinks from the HTML returned by a WebClient call.
Instructions: add using directives for the following namespaces:
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
/// <summary>
/// Extracts the value of every double-quoted <c>href</c> attribute found in
/// the supplied text (typically the HTML downloaded from a URL).
/// </summary>
/// <remarks>
/// The attribute name is matched case-insensitively, so <c>href</c>,
/// <c>HREF</c> and <c>Href</c> are all recognised. Values are returned exactly
/// as written in the markup: no entity decoding and no resolution of relative
/// URLs is performed. Only the form <c>href="value"</c> is recognised;
/// single-quoted or unquoted attribute values, and attributes with whitespace
/// around the equals sign, are not matched.
/// </remarks>
/// <param name="str">The markup to scan. May be null or empty.</param>
/// <returns>
/// An <see cref="ArrayList"/> of strings, one per matched attribute, in the
/// order the matches occur in <paramref name="str"/> (duplicates are kept).
/// The list is empty when <paramref name="str"/> is null, empty, or contains
/// no matching attribute.
/// </returns>
public ArrayList ExtractLinks(string str)
{
    ArrayList linksList = new ArrayList();

    // Nothing to scan: hand back an empty list so callers can iterate the
    // result without a null check.
    if (string.IsNullOrEmpty(str))
    {
        return linksList;
    }

    // The named group captures the attribute value directly, so the prefix
    // never has to be stripped with a case-sensitive string operation (which
    // mangled matches such as HREF="..."). [^"]+ accepts any non-empty value
    // up to the closing quote, including query strings and fragments.
    // ExplicitCapture is kept, which is why the group must be named.
    const string pattern = "href=\"(?<url>[^\"]+)\"";
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.ExplicitCapture;

    foreach (Match match in Regex.Matches(str, pattern, options))
    {
        linksList.Add(match.Groups["url"].Value);
    }

    return linksList;
}