deadlink-crawler

[unmaintained] crawls a site to detect dead links

commit 45b79df501183a85956e0bf33d6319455fd5a321
parent 6f83c00ddd3ab4a45b07da87ace0a3492dbd80c9
Author: Stefan <stefan@eliteinformatiker.de>
Date:   Tue, 29 Jan 2013 15:07:31 +0100

extracted frontier to own class

Diffstat:
M crawler.py  | 45 +++++++++++----------------------------------
A frontier.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 74 insertions(+), 34 deletions(-)

diff --git a/crawler.py b/crawler.py
@@ -6,21 +6,15 @@ import heapq
 import argparse
 
+import frontier
+
 class Crawler(object):
     def __init__(self, init_url):
         init_domain = urlparse.urlparse(init_url).netloc
-        # A list of urls that still have to be searched sorted by
-        # domains,
-        self.frontier = {}
-        self.frontier[init_domain] = [(init_url, None)]
-
-        # A list containing the next crawltimes on domain level,
-        # to achieve a optimal throughput maintaining a polite policy
-        self.crawltimes = [(time.time(), init_domain)]
-
-        # Urls we have already visited
-        self.found = set()
+        # Manages our domains we want to visit or have visited
+        self.frontier = frontier.Frontier()
+        self.frontier.add(init_url, None)
 
         # List of deadlinks for each URL we have,
         # i.e. url1: [deadlink1, deadlink2]
@@ -32,10 +26,6 @@ class Crawler(object):
 
         # Timeout in seconds to wait, so that we do not kill our server
         self._wait_time = 0
-
-        # Timeout for waiting between each call to the same
-        # domain twice, this determines how polite the crawler is
-        self._polite_time = 1
 
     @property
     def restrict(self):
@@ -56,30 +46,22 @@ class Crawler(object):
 
     @property
     def polite_time(self):
-        return self._polite_time
+        return self.frontier.polite_time
 
     @polite_time.setter
     def polite_time(self, seconds):
         if seconds >= 0:
-            self._polite_time = seconds
+            self.frontier.polite_time = seconds
 
     def crawl(self):
         while len(self.frontier) > 0:
             time.sleep(self.wait_time)
 
-            next_time, next_domain = heapq.heappop(self.crawltimes)
-            next_url = self.frontier[next_domain].pop()
+            next_time, next_url = self.frontier.next()
 
             while time.time() < next_time:
                 time.sleep(0.5)
 
-            if len(self.frontier[next_domain]) > 0:
-                next_crawl = time.time() + self.polite_time
-                heapq.heappush(self.crawltimes,
-                    (next_crawl, next_domain))
-            else:
-                del(self.frontier[next_domain])
-
             try:
                 self.visit_url(next_url[0], next_url[1])
             except urllib2.URLError:
@@ -90,6 +72,8 @@ class Crawler(object):
     def visit_url(self, url, found_via):
         response = self.check_url(url, found_via)
 
+        self.frontier.notify_visit(url)
+
         if response != None and not self.excluded(url):
             self.collect_new_urls(url, response.read())
 
@@ -99,14 +83,7 @@ class Crawler(object):
         try:
             for page in self.extract_urls(html):
                 page = urlparse.urljoin(url, page)
-                domain = urlparse.urlparse(page).netloc
-
-                if not page in self.found:
-                    if not domain in self.frontier:
-                        self.frontier.setdefault(domain, [])
-                        heapq.heappush(self.crawltimes, (time.time(), domain))
-                    self.frontier[domain].append((page, url))
-                    self.found.add(page)
+                self.frontier.add(page, url)
         except UnicodeEncodeError:
             pass
diff --git a/frontier.py b/frontier.py
@@ -0,0 +1,63 @@
+import urlparse
+import time
+import heapq
+
+class Frontier(object):
+    def __init__(self):
+        # A list of urls that still have to be searched sorted by
+        # domains,
+        self.urls = {}
+
+        # A list containing the next crawltimes on domain level,
+        # to achieve a optimal throughput maintaining a polite policy
+        self.crawltimes = []
+
+        # Urls we have already found and in our set
+        self.found = set()
+
+        self._polite_time = 1
+
+    @property
+    def polite_time(self):
+        return self._polite_time
+
+    @polite_time.setter
+    def polite_time(self, seconds):
+        if seconds >= 0:
+            self._polite_time = seconds
+
+    def add(self, url, found_via):
+        if url in self.found:
+            return False
+
+        domain = urlparse.urlparse(url).netloc
+
+        # means this is the first URL in our set
+        if not domain in self.urls:
+            self.urls[domain] = []
+            heapq.heappush(self.crawltimes, (time.time(), domain))
+
+        self.urls[domain].append((url, found_via))
+        self.found.add(url)
+
+        return True
+
+    def next(self):
+        next_time, next_domain = heapq.heappop(self.crawltimes)
+
+        next_url = self.urls[next_domain].pop()
+
+        if len(self.urls[next_domain]) == 0:
+            del(self.urls[next_domain])
+
+        return next_time, next_url
+
+    def notify_visit(self, url):
+        domain = urlparse.urlparse(url).netloc
+
+        # If there are still other urls on this domain to crawl, add crawl time
+        if domain in self.urls:
+            heapq.heappush(self.crawltimes, (time.time() + self.polite_time, domain))
+
+    def __len__(self):
+        return sum([len(self.urls[domain]) for domain in self.urls])
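
Usage note (not part of the commit): a minimal Python 2 sketch of how Crawler.crawl() drives the new Frontier API after this change. The seed URL and the standalone loop are illustrative only; in the project the same calls happen inside the Crawler class.

    # Sketch: pull the next (time, url) pair, wait until the per-domain
    # politeness deadline, then report the visit so the domain is rescheduled.
    import time
    import frontier

    f = frontier.Frontier()
    f.polite_time = 2                    # seconds between two hits on one domain
    f.add("http://example.com/", None)   # (url, found_via); seed URL is hypothetical

    while len(f) > 0:
        next_time, (url, found_via) = f.next()
        while time.time() < next_time:   # honour the domain's next crawl time
            time.sleep(0.5)
        print "visiting %s (found via %s)" % (url, found_via)
        f.notify_visit(url)              # re-queue the domain, politely delayed

This keeps the politeness heap entirely inside Frontier: the crawler no longer touches crawltimes directly, it only calls add(), next() and notify_visit().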