Crawling is something all search engines do across the web. This is a simple web crawler that fetches the page you give it and returns all the links found on that page. For the sake of example, I used Google.com.
using System;
using System.Net;
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;
/// <summary>
/// Minimal single-page crawler: downloads one page and prints every distinct
/// link found in its HTML to the console.
/// </summary>
public class Crawler
{
    // Matches the quoted value of an href attribute that directly follows "<a"
    // (optionally separated by whitespace). Anchors that put another attribute
    // before href are not matched by this pattern.
    // Built once and reused instead of being re-created on every call.
    private static readonly Regex LinkRegex =
        new Regex("(?<=<a\\s*?href=(?:'|\"))[^'\"]*?(?=(?:'|\"))");

    /// <summary>
    /// Fetches the hard-coded start URL and writes each distinct link on that
    /// page to standard output, one per line.
    /// </summary>
    public static void Main()
    {
        string url = "http://www.google.com";
        HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(url);
        httpWebRequest.UserAgent = "Anurag's Crawler";

        string htmlText;
        // Dispose the response, its stream and the reader deterministically so
        // the underlying connection is released even if reading throws.
        using (WebResponse webResponse = httpWebRequest.GetResponse())
        using (Stream stream = webResponse.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(stream))
        {
            htmlText = streamReader.ReadToEnd();
        }

        foreach (string link in GetNewLinks(htmlText))
        {
            Console.WriteLine(link);
        }
    }

    /// <summary>
    /// Extracts the distinct href values matched by <see cref="LinkRegex"/>
    /// from the given HTML text.
    /// </summary>
    /// <param name="content">HTML text to scan; may be null or empty.</param>
    /// <returns>
    /// The distinct link strings in the order they first appear; an empty list
    /// when <paramref name="content"/> is null or empty.
    /// </returns>
    private static List<string> GetNewLinks(string content)
    {
        List<string> newLinks = new List<string>();
        if (string.IsNullOrEmpty(content))
        {
            return newLinks;
        }

        // The set gives O(1) duplicate detection; the list preserves the
        // first-seen order of the links.
        HashSet<string> seen = new HashSet<string>();
        foreach (Match match in LinkRegex.Matches(content))
        {
            if (seen.Add(match.Value))
            {
                newLinks.Add(match.Value);
            }
        }

        return newLinks;
    }
}
Web Crawler: this is a crawler written using Reactive Extensions.
One more web crawler that's available, and a bit more complex: Archnode.net
No comments:
Post a Comment