commit b9ffe8fc9b6df727e6f316522c2786c962942277
parent 72a3b889125ea2ee59c4220c72f82c3b554e7b5f
Author: Stefan <cct@stefan-koch.name>
Date: Wed, 23 Jan 2013 20:09:04 +0100
do not crawl external urls twice
Diffstat:
1 file changed, 3 insertions(+), 0 deletions(-)
diff --git a/crawler.py b/crawler.py
@@ -65,6 +65,9 @@ class Crawler(object):
pass
def visit_url_external(self, url, found_via):
+ if url in self.done:
+ return # no need to check for one url twice
+
print("Trying external: %s" % url)
request = urllib2.Request(url)