deadlink-crawler

[unmaintained] crawls a site to detect dead links

commit ad589588caefca329d4bbcd5778652f340a8e855
parent e332d0be7bfc6cf3d91de13b4bfc7584c3617a4d
Author: Stefan <cct@stefan-koch.name>
Date:   Tue, 22 Jan 2013 21:59:11 +0100

add basic crawler

Diffstat:
A crawler.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+), 0 deletions(-)

diff --git a/crawler.py b/crawler.py
@@ -0,0 +1,54 @@
+from BeautifulSoup import BeautifulSoup
+import urllib2, urlparse
+import re
+import time
+
+class Crawler(object):
+    def __init__(self, init_url):
+        self.pages = [init_url]
+        self.done = set()
+
+        self.url_match = None
+        self.wait_time = 0
+
+    def set_url_restrict(self, regexp):
+        self.url_match = re.compile(regexp)
+
+    def set_wait_time(self, seconds):
+        self.wait_time = seconds
+
+    def crawl(self):
+        while len(self.pages) > 0:
+            time.sleep(self.wait_time)
+
+            next_url = self.pages.pop()
+            print next_url
+
+            try:
+                html = self.visit_url(next_url)
+            except:
+                continue
+            finally:
+                self.done.add(next_url)
+
+            try:
+                for page in self.extract_urls(html):
+                    page = urlparse.urljoin(next_url, page)
+
+                    if not page in self.done and self.url_match.search(page):
+                        self.pages.append(page)
+            except UnicodeEncodeError:
+                pass
+
+    def visit_url(self, url):
+        return urllib2.urlopen(url)
+
+    def extract_urls(self, page):
+        soup = BeautifulSoup(page)
+        return [link.get('href') for link in soup.findAll('a')]
+
+
+c = Crawler("http://stefan-koch.name/")
+c.set_url_restrict("http://stefan-koch.name/.+")
+c.set_wait_time(5)
+c.crawl()
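
Note: the committed code targets Python 2 (BeautifulSoup 3, urllib2, urlparse, print statement). For readers on Python 3, a minimal equivalent sketch follows. It is not part of the repository; it assumes the third-party beautifulsoup4 package and uses the standard-library urllib.request and urllib.parse in place of urllib2/urlparse, but otherwise mirrors the class and method names of the commit.

# Hypothetical Python 3 port of the committed crawler (not part of this commit).
# Assumes beautifulsoup4 ("bs4") is installed.
import re
import time
import urllib.request
from urllib.parse import urljoin

from bs4 import BeautifulSoup


class Crawler:
    def __init__(self, init_url):
        self.pages = [init_url]   # URLs still to visit
        self.done = set()         # URLs already visited
        self.url_match = None     # regex restricting which URLs are followed
        self.wait_time = 0        # seconds to sleep between requests

    def set_url_restrict(self, regexp):
        self.url_match = re.compile(regexp)

    def set_wait_time(self, seconds):
        self.wait_time = seconds

    def crawl(self):
        while self.pages:
            time.sleep(self.wait_time)
            next_url = self.pages.pop()
            print(next_url)

            try:
                html = self.visit_url(next_url)
            except Exception:
                continue
            finally:
                # Mark the URL as visited even if fetching it failed.
                self.done.add(next_url)

            for page in self.extract_urls(html):
                page = urljoin(next_url, page)
                if page not in self.done and self.url_match.search(page):
                    self.pages.append(page)

    def visit_url(self, url):
        # Returns a file-like response object; BeautifulSoup accepts it directly.
        return urllib.request.urlopen(url)

    def extract_urls(self, page):
        soup = BeautifulSoup(page, "html.parser")
        return [link.get("href") for link in soup.find_all("a") if link.get("href")]


c = Crawler("http://stefan-koch.name/")
c.set_url_restrict("http://stefan-koch.name/.+")
c.set_wait_time(5)
c.crawl()

As in the original, pages.pop() takes URLs from the end of the list (roughly depth-first), only links matching the restriction regex are queued, and each URL is recorded in done whether or not the request succeeded, so it is never retried.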