Sunday 22 November 2015

Writing a simple web crawler

Crawling is something every search engine does across the web. This post shows a simple web crawler that fetches the page you give it and returns all the links found on that page. Here, for the sake of example, the crawler takes a single URL and prints every link it finds.

using System;
using System.Net;
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;

public class Crawler
{
    public static void Main()
    {
        string url = ""; // URL of the page to crawl goes here
        HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(url);
        httpWebRequest.UserAgent = "Anurag's Crawler";
        WebResponse webResponse = httpWebRequest.GetResponse();
        Stream stream = webResponse.GetResponseStream();
        StreamReader streamReader = new StreamReader(stream);
        string htmlText = streamReader.ReadToEnd();

        var allLinks = GetNewLinks(htmlText);
        foreach (var link in allLinks)
        {
            Console.WriteLine(link);
        }
    }

    // Extracts the href value of every anchor tag, skipping duplicates
    private static List<string> GetNewLinks(string content)
    {
        Regex regexForLink = new Regex("(?<=<a\\s*?href=(?:'|\"))[^'\"]*?(?=(?:'|\"))");
        List<string> newLinks = new List<string>();
        foreach (Match match in regexForLink.Matches(content))
        {
            if (!newLinks.Contains(match.ToString()))
            {
                newLinks.Add(match.ToString());
            }
        }
        return newLinks;
    }
}

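To see what the regex in GetNewLinks actually matches, here is a small self-contained sketch that runs the same pattern over an inline HTML fragment (the fragment and its URLs are made up for illustration):

```csharp
using System;
using System.Text.RegularExpressions;

public class RegexDemo
{
    public static void Main()
    {
        // A made-up HTML fragment with both quote styles for illustration
        string html = "<p><a href='http://example.com/a'>A</a>" +
                      "<a href=\"http://example.com/b\">B</a></p>";

        // Same lookbehind/lookahead regex used by the crawler:
        // it captures the text between the quotes of an href attribute
        Regex regexForLink = new Regex("(?<=<a\\s*?href=(?:'|\"))[^'\"]*?(?=(?:'|\"))");

        foreach (Match match in regexForLink.Matches(html))
        {
            Console.WriteLine(match.Value);
        }
        // prints:
        // http://example.com/a
        // http://example.com/b
    }
}
```

Note that the variable-length lookbehind here is a .NET-specific regex feature; in many other regex engines this pattern would need to be rewritten with a capture group.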
Web Crawler - a crawler written using Reactive Extensions.
One more web crawler that is available and a bit more complex.
