deadlink-crawler

[unmaintained] crawls a site to detect dead links

commit c9c2c736eda17dd625a4a6438b10d9f90b270bda
parent b9ffe8fc9b6df727e6f316522c2786c962942277
Author: Stefan <cct@stefan-koch.name>
Date:   Wed, 23 Jan 2013 20:14:34 +0100

do not crawl external urls twice

Diffstat:
M crawler.py | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/crawler.py b/crawler.py
@@ -12,7 +12,7 @@ class Crawler(object):
         self.pages = [(init_url, None)]
 
         # Urls we have already visited
-        self.done = set()
+        self.found = set()
 
         # List of deadlinks for each URL we have,
         # i.e. url1: [deadlink1, deadlink2]
@@ -41,8 +41,6 @@ class Crawler(object):
                 self.visit_url(next_url)
             except urllib2.URLError:
                 continue
-            finally:
-                self.done.add(next_url)
 
     def visit_url(self, url_tuple):
         if self.url_match.search(url_tuple[0]):
@@ -59,15 +57,13 @@ class Crawler(object):
             for page in self.extract_urls(html):
                 page = urlparse.urljoin(url, page)
 
-                if not page in self.done:
+                if not page in self.found:
                     self.pages.append((page, url))
+                    self.found.add(page)
         except UnicodeEncodeError:
             pass
 
     def visit_url_external(self, url, found_via):
-        if url in self.done:
-            return # no need to check for one url twice
-
         print("Trying external: %s" % url)
 
         request = urllib2.Request(url)
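
The change moves deduplication from visit time to enqueue time: every discovered link is added to the found set the moment it is queued, so the separate "already done?" check for external URLs becomes unnecessary and no URL is fetched twice. Below is a minimal, self-contained sketch of that mark-on-enqueue pattern; extract_urls, check_url, and the toy link graph are hypothetical stand-ins for illustration, not code from this repository.

    from collections import deque

    def crawl(start_url, extract_urls, check_url):
        # Mark URLs as soon as they are queued, not after they are visited.
        # A link that is discovered twice before being processed is then
        # still checked only once, which is the behaviour this commit
        # restores for external URLs.
        found = {start_url}
        queue = deque([start_url])

        while queue:
            url = queue.popleft()
            check_url(url)  # e.g. fetch the page and record dead links

            for link in extract_urls(url):
                if link not in found:
                    found.add(link)
                    queue.append(link)

    # Toy link graph standing in for real HTTP requests.
    graph = {
        "http://example.com/": ["http://example.com/a", "http://external.org/"],
        "http://example.com/a": ["http://external.org/"],  # same external link again
        "http://external.org/": [],
    }

    crawl("http://example.com/",
          extract_urls=lambda u: graph.get(u, []),
          check_url=lambda u: print("checking %s" % u))
    # "checking http://external.org/" is printed exactly once.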