commit 4c7a665a5cc210d0daa0c0ddff0a8182939b4f91
parent ad589588caefca329d4bbcd5778652f340a8e855
Author: Stefan <stefan@eliteinformatiker.de>
Date: Wed, 23 Jan 2013 13:49:28 +0100
added deadlink list and analysis
Diffstat:
 crawler.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 58 insertions(+), 17 deletions(-)
diff --git a/crawler.py b/crawler.py
@@ -5,10 +5,22 @@ import time
class Crawler(object):
def __init__(self, init_url):
- self.pages = [init_url]
+ # A list of URLs still to be crawled; the second element
+ # of each tuple is the URL via which we found this URL
+ self.pages = [(init_url, None)]
+
+ # URLs we have already visited
self.done = set()
+ # Dead links found on each crawled URL,
+ # e.g. url1: [deadlink1, deadlink2]
+ self.deadlinks = {}
+
+ # Regular expression for URLs we are interested in (our internal
+ # URLs)
self.url_match = None
+
+ # Delay in seconds between requests, so that we do not overload our server
self.wait_time = 0
def set_url_restrict(self, regexp):
@@ -22,26 +34,55 @@ class Crawler(object):
time.sleep(self.wait_time)
next_url = self.pages.pop()
- print next_url
try:
- html = self.visit_url(next_url)
- except:
+ self.visit_url(next_url)
+ except urllib2.URLError:
continue
finally:
self.done.add(next_url)
-
- try:
- for page in self.extract_urls(html):
- page = urlparse.urljoin(next_url, page)
-
- if not page in self.done and self.url_match.search(page):
- self.pages.append(page)
- except UnicodeEncodeError:
- pass
- def visit_url(self, url):
- return urllib2.urlopen(url)
+ def visit_url(self, url_tuple):
+ if self.url_match.search(url_tuple[0]):
+ self.visit_url_internal(url_tuple[0])
+ else:
+ self.visit_url_external(url_tuple[0], url_tuple[1])
+
+ def visit_url_internal(self, url):
+ print("Crawling internal: %s" % url)
+
+ html = urllib2.urlopen(url)
+
+ try:
+ for page in self.extract_urls(html):
+ page = urlparse.urljoin(url, page)
+
+ if page not in self.done:
+ self.pages.append((page, url))
+ except UnicodeEncodeError:
+ pass
+
+ def visit_url_external(self, url, found_via):
+ print("Trying external: %s" % url)
+
+ request = urllib2.Request(url)
+
+ try:
+ response = urllib2.urlopen(request)
+ except urllib2.HTTPError:
+ # urllib2 raises HTTPError for error responses such as 404
+ self.add_to_deadlinks(url, found_via)
+ return
+
+ status = response.getcode()
+ if status is not None and status >= 400:
+ self.add_to_deadlinks(url, found_via)
+
+ def add_to_deadlinks(self, url, found_via):
+ self.deadlinks.setdefault(found_via, [])
+ self.deadlinks[found_via].append(url)
+
+ print("Found new deadlink %s on %s" % (url, found_via))
def extract_urls(self, page):
soup = BeautifulSoup(page)
@@ -49,6 +90,6 @@ class Crawler(object):
c = Crawler("http://stefan-koch.name/")
-c.set_url_restrict("http://stefan-koch.name/.+")
-c.set_wait_time(5)
+c.set_url_restrict("http://stefan-koch.name/.*")
+c.set_wait_time(1)
c.crawl()
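
For reference, a minimal sketch of how the new deadlinks dict could be inspected after the crawl. The reporting loop below is illustrative only and not part of this commit; it assumes crawl() has returned with c.deadlinks populated as {page the link was found on: [dead URLs]}.

# Illustrative only (not part of this commit): print a small
# dead-link report from the crawler's deadlinks dict, which maps
# the page a link was found on to the dead URLs discovered there.
for found_via, dead_urls in c.deadlinks.items():
    print("%d dead link(s) found on %s:" % (len(dead_urls), found_via))
    for dead_url in dead_urls:
        print("  %s" % dead_url)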