deadlink-crawler

[unmaintained] crawls a site to detect dead links

commit 4c7a665a5cc210d0daa0c0ddff0a8182939b4f91
parent ad589588caefca329d4bbcd5778652f340a8e855
Author: Stefan <stefan@eliteinformatiker.de>
Date:   Wed, 23 Jan 2013 13:49:28 +0100

added deadlink list and analysis

Diffstat:
M crawler.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 58 insertions(+), 17 deletions(-)

diff --git a/crawler.py b/crawler.py
@@ -5,10 +5,22 @@ import time
 
 class Crawler(object):
     def __init__(self, init_url):
-        self.pages = [init_url]
+        # A list of to be crawled urls, where the second element
+        # in the tuple is the URL via which we found this URL
+        self.pages = [(init_url, None)]
+
+        # Urls we have already visited
         self.done = set()
 
+        # List of deadlinks for each URL we have,
+        # i.e. url1: [deadlink1, deadlink2]
+        self.deadlinks = {}
+
+        # Regular expression for URLs we are interested in (our internal
+        # URLs)
         self.url_match = None
+
+        # Timeout in seconds to wait, so that we do not kill our server
         self.wait_time = 0
 
     def set_url_restrict(self, regexp):
@@ -22,26 +34,55 @@ class Crawler(object):
             time.sleep(self.wait_time)
 
             next_url = self.pages.pop()
-            print next_url
 
             try:
-                html = self.visit_url(next_url)
-            except:
+                self.visit_url(next_url)
+            except urllib2.URLError:
                 continue
             finally:
                 self.done.add(next_url)
-
-            try:
-                for page in self.extract_urls(html):
-                    page = urlparse.urljoin(next_url, page)
-
-                    if not page in self.done and self.url_match.search(page):
-                        self.pages.append(page)
-            except UnicodeEncodeError:
-                pass
 
-    def visit_url(self, url):
-        return urllib2.urlopen(url)
+    def visit_url(self, url_tuple):
+        if self.url_match.search(url_tuple[0]):
+            self.visit_url_internal(url_tuple[0])
+        else:
+            self.visit_url_external(url_tuple[0], url_tuple[1])
+
+    def visit_url_internal(self, url):
+        print("Crawling internal: %s" % url)
+
+        html = urllib2.urlopen(url)
+
+        try:
+            for page in self.extract_urls(html):
+                page = urlparse.urljoin(url, page)
+
+                if not page in self.done:
+                    self.pages.append((page, url))
+        except UnicodeEncodeError:
+            pass
+
+    def visit_url_external(self, url, found_via):
+        print("Trying external: %s" % url)
+
+        request = urllib2.Request(url)
+
+        try:
+            response = urllib2.urlopen(request)
+        except urllib2.HTTPError:
+            # We receive an exception in case of 404
+            self.add_to_deadlinks(url, found_via)
+            return
+
+        status = response.getcode()
+        if status != None and status >= 400:
+            self.add_to_deadlinks(url, found_via)
+
+    def add_to_deadlinks(self, url, found_via):
+        self.deadlinks.setdefault(found_via, [])
+        self.deadlinks[found_via].append(url)
+
+        print("Found new deadlink %s on %s" % (url, found_via))
 
     def extract_urls(self, page):
         soup = BeautifulSoup(page)
@@ -49,6 +90,6 @@ class Crawler(object):
 
 
 c = Crawler("http://stefan-koch.name/")
-c.set_url_restrict("http://stefan-koch.name/.+")
-c.set_wait_time(5)
+c.set_url_restrict("http://stefan-koch.name/.*")
+c.set_wait_time(1)
 c.crawl()
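
A possible follow-up, not part of this commit: once crawl() returns, the new deadlinks dict (page URL -> list of dead links found on that page) can be dumped as a simple report. A minimal sketch, assuming it were appended after the existing c.crawl() call at the bottom of crawler.py:

    # Hypothetical report loop, not in this commit: walk the deadlinks
    # dict filled by add_to_deadlinks() during the crawl.
    for found_via, urls in c.deadlinks.items():
        print("%d dead link(s) found on %s:" % (len(urls), found_via))
        for url in urls:
            print("    %s" % url)

Because add_to_deadlinks() calls setdefault(found_via, []) before appending, every key in deadlinks is guaranteed to hold a list, so a loop like this never hits a missing key.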