deadlink-crawler

[unmaintained] crawls a site to detect dead links

commit 74728c39ce266ce4bf18270098064f918da480cd
parent ab9fcee143b68917dedb705b466813bee60c4b67
Author: Stefan <stefan@eliteinformatiker.de>
Date:   Thu, 24 Jan 2013 14:12:52 +0100

also check own domain for deadlinks

Diffstat:
M crawler.py | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/crawler.py b/crawler.py @@ -39,23 +39,21 @@ class Crawler(object): next_url = self.pages.pop() try: - self.visit_url(next_url) + self.visit_url(next_url[0], next_url[1]) except urllib2.URLError: continue print("DEADLINKS") print(self.deadlinks) - def visit_url(self, url_tuple): - if self.url_match.search(url_tuple[0]): - self.visit_url_internal(url_tuple[0]) - else: - self.visit_url_external(url_tuple[0], url_tuple[1]) - - def visit_url_internal(self, url): - print("Crawling internal: %s" % url) + def visit_url(self, url, found_via): + response = self.check_url(url, found_via) - html = urllib2.urlopen(url) + if response != None and self.url_match.search(url): + self.collect_new_urls(url, response.read()) + + def collect_new_urls(self, url, html): + print("Fetching new URLs from: %s" % url) try: for page in self.extract_urls(html): @@ -67,8 +65,8 @@ class Crawler(object): except UnicodeEncodeError: pass - def visit_url_external(self, url, found_via): - print("Trying external: %s" % url) + def check_url(self, url, found_via): + print("Trying URL: %s" % url) request = urllib2.Request(url) @@ -77,11 +75,13 @@ class Crawler(object): except (urllib2.HTTPError, httplib.BadStatusLine): # We receive an exception in case of 404 self.add_to_deadlinks(url, found_via) - return + return None status = response.getcode() if status != None and status >= 400: self.add_to_deadlinks(url, found_via) + + return response def add_to_deadlinks(self, url, found_via): self.deadlinks.setdefault(found_via, [])