commit ad589588caefca329d4bbcd5778652f340a8e855
parent e332d0be7bfc6cf3d91de13b4bfc7584c3617a4d
Author: Stefan <cct@stefan-koch.name>
Date: Tue, 22 Jan 2013 21:59:11 +0100
add basic crawler
Diffstat:
A  crawler.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 63 insertions(+), 0 deletions(-)
diff --git a/crawler.py b/crawler.py
@@ -0,0 +1,63 @@
+from BeautifulSoup import BeautifulSoup
+import urllib2, urlparse
+import re
+import time
+
+class Crawler(object):
+    def __init__(self, init_url):
+        # URLs still to visit (used as a stack) and URLs already seen
+        self.pages = [init_url]
+        self.done = set()
+
+        self.url_match = None
+        self.wait_time = 0
+
+    def set_url_restrict(self, regexp):
+        # only URLs matching this pattern will be queued
+        self.url_match = re.compile(regexp)
+
+    def set_wait_time(self, seconds):
+        self.wait_time = seconds
+
+    def crawl(self):
+        while self.pages:
+            time.sleep(self.wait_time)
+
+            next_url = self.pages.pop()
+            print next_url
+
+            try:
+                html = self.visit_url(next_url)
+            except urllib2.URLError:
+                # skip pages that cannot be fetched
+                continue
+            finally:
+                self.done.add(next_url)
+
+            try:
+                for page in self.extract_urls(html):
+                    # resolve relative links against the current page
+                    page = urlparse.urljoin(next_url, page)
+
+                    if (page not in self.done and page not in self.pages
+                            and self.url_match.search(page)):
+                        self.pages.append(page)
+            except UnicodeEncodeError:
+                # some pages yield URLs that mix str and unicode badly
+                pass
+
+    def visit_url(self, url):
+        # fetch the page and return its raw HTML
+        return urllib2.urlopen(url).read()
+
+    def extract_urls(self, page):
+        # collect the href attribute of every anchor tag
+        soup = BeautifulSoup(page)
+        return [link.get('href') for link in soup.findAll('a')
+                if link.get('href')]
+
+
+c = Crawler("http://stefan-koch.name/")
+c.set_url_restrict("http://stefan-koch.name/.+")
+c.set_wait_time(5)
+c.crawl()
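
The diff above targets Python 2 (urllib2, urlparse, BeautifulSoup 3). As a rough illustration only, not part of this commit: on Python 3 the fetch-and-extract step could be sketched as below, assuming the bs4 package with its html.parser backend; fetch_links is a hypothetical helper name, not something this code defines.

from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def fetch_links(url):
    # fetch one page and return every link on it as an absolute URL
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    return [urljoin(url, a["href"]) for a in soup.find_all("a", href=True)]

One design note: because crawl() takes the next URL with pages.pop(), the traversal is depth-first; popping from the front of the list (or using collections.deque with popleft()) would make it breadth-first instead.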