deadlink-crawler

[unmaintained] crawls a site to detect dead links

commit a1612806241d715e7e92293e1f928a35ba62ba02
parent 7924e13a2029fe3faed9321bad84bb9b21629a0f
Author: Stefan <stefan@eliteinformatiker.de>
Date:   Tue,  5 Feb 2013 14:41:38 +0100

added website class for later usage

Diffstat:
M crawler.py  |  3 ++-
M frontier.py | 54 ++++++++++++++++++++++++++++++++++++------------------
2 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/crawler.py b/crawler.py
@@ -72,7 +72,7 @@ class Crawler(object):
 
         response = self.check_url(url, found_via)
         self.frontier.notify_visit(url)
-        
+
         if response != None and not self.excluded(url):
             self.collect_new_urls(url, response.read())
 
@@ -82,6 +82,7 @@ class Crawler(object):
         try:
             for page in self.extract_urls(html):
                 page = urlparse.urljoin(url, page)
+                print("adding page %s" % page)
                 self.frontier.add(page, url)
         except UnicodeEncodeError:
             pass
diff --git a/frontier.py b/frontier.py
@@ -4,13 +4,10 @@ import heapq
 
 class Frontier(object):
     def __init__(self):
-        # A list of urls that still have to be searched sorted by
-        # domains,
-        self.urls = {}
-
         # A list containing the next crawltimes on domain level,
         # to achieve a optimal throughput maintaining a polite policy
-        self.crawltimes = []
+        self.frontier = []
+        self.websites = {}
 
         # Urls we have already found and in our set
         self.found = set()
@@ -33,31 +30,52 @@ class Frontier(object):
         domain = urlparse.urlparse(url).netloc
 
         # means this is the first URL in our set
-        if not domain in self.urls:
-            self.urls[domain] = []
-            heapq.heappush(self.crawltimes, (time.time(), domain))
+        if domain in self.websites:
+            website = self.websites[domain]
+        else:
+            website = Website(domain)
+            heapq.heappush(self.frontier, (time.time(), website))
+            self.websites[domain] = website
 
-        self.urls[domain].append((url, found_via))
+        website.add_url(url, found_via)
         self.found.add(url)
 
         return True
 
     def next(self):
-        next_time, next_domain = heapq.heappop(self.crawltimes)
-
-        next_url = self.urls[next_domain].pop()
+        next_time, next_domain = heapq.heappop(self.frontier)
 
-        if len(self.urls[next_domain]) == 0:
-            del(self.urls[next_domain])
+        next_url = next_domain.next_url()
 
         return next_time, next_url
 
     def notify_visit(self, url):
         domain = urlparse.urlparse(url).netloc
-
+        website = self.websites[domain]
+
         # If there are still other urls on this domain to crawl, add crawl time
-        if domain in self.urls:
-            heapq.heappush(self.crawltimes, (time.time() + self.polite_time, domain))
+        if len(website.urls) > 0:
+            heapq.heappush(self.frontier, (time.time() + self.polite_time, website))
+        else:
+            del(self.websites[domain])
 
     def __len__(self):
-        return sum([len(self.urls[domain]) for domain in self.urls])
+        return sum([len(website.urls) for time, website
+            in self.frontier])
+
+
+class Website(object):
+    def __init__(self, domain):
+        self.domain = domain
+        self.urls = []
+        self.robots = None
+
+    def is_allowed(self, url):
+        # TODO
+        return True
+
+    def add_url(self, url, found_via):
+        self.urls.append((url, found_via))
+
+    def next_url(self):
+        return self.urls.pop()
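
The commit replaces the Frontier's per-domain URL dict with a heap of Website objects ordered by their next allowed crawl time. Below is a minimal sketch of how the refactored Frontier could be driven after this commit; the seed URL, the polite_time value and the sleep/fetch step are illustrative assumptions, not code from this repository (the real loop lives in crawler.py's Crawler class).

# Illustrative sketch only: polling the refactored Frontier from this commit.
# Assumes frontier.py is importable; the URL, polite_time and fetch step are
# made up for the example.
import time
from frontier import Frontier

frontier = Frontier()
frontier.polite_time = 1.0                 # assumed: per-domain delay used by notify_visit()
frontier.add("http://example.com/", None)  # seed URL; found_via is None for the start page

while len(frontier) > 0:
    crawl_time, (url, found_via) = frontier.next()  # pop the domain due next from the heap
    delay = crawl_time - time.time()
    if delay > 0:
        time.sleep(delay)                  # wait until the domain's polite crawl time
    # ... fetch `url` and collect new links here (Crawler.check_url / collect_new_urls) ...
    frontier.notify_visit(url)             # re-queues the domain's Website if URLs remain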