commit 45b79df501183a85956e0bf33d6319455fd5a321
parent 6f83c00ddd3ab4a45b07da87ace0a3492dbd80c9
Author: Stefan <stefan@eliteinformatiker.de>
Date: Tue, 29 Jan 2013 15:07:31 +0100
extracted frontier into its own class
Diffstat:
M crawler.py  | 45 +++++++++++----------------------------------
A frontier.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 74 insertions(+), 34 deletions(-)
diff --git a/crawler.py b/crawler.py
@@ -6,21 +6,15 @@ import heapq
import argparse
+import frontier
+
class Crawler(object):
def __init__(self, init_url):
init_domain = urlparse.urlparse(init_url).netloc
- # A list of urls that still have to be searched sorted by
- # domains,
- self.frontier = {}
- self.frontier[init_domain] = [(init_url, None)]
-
- # A list containing the next crawltimes on domain level,
- # to achieve a optimal throughput maintaining a polite policy
- self.crawltimes = [(time.time(), init_domain)]
-
- # Urls we have already visited
- self.found = set()
+ # Manages the URLs we still want to visit and those we have already visited
+ self.frontier = frontier.Frontier()
+ self.frontier.add(init_url, None)
# List of deadlinks for each URL we have,
# i.e. url1: [deadlink1, deadlink2]
@@ -32,10 +26,6 @@ class Crawler(object):
# Timeout in seconds to wait, so that we do not kill our server
self._wait_time = 0
-
- # Timeout for waiting between each call to the same
- # domain twice, this determines how polite the crawler is
- self._polite_time = 1
@property
def restrict(self):
@@ -56,30 +46,22 @@ class Crawler(object):
@property
def polite_time(self):
- return self._polite_time
+ return self.frontier.polite_time
@polite_time.setter
def polite_time(self, seconds):
if seconds >= 0:
- self._polite_time = seconds
+ self.frontier.polite_time = seconds
def crawl(self):
while len(self.frontier) > 0:
time.sleep(self.wait_time)
- next_time, next_domain = heapq.heappop(self.crawltimes)
- next_url = self.frontier[next_domain].pop()
+ next_time, next_url = self.frontier.next()
while time.time() < next_time:
time.sleep(0.5)
- if len(self.frontier[next_domain]) > 0:
- next_crawl = time.time() + self.polite_time
- heapq.heappush(self.crawltimes,
- (next_crawl, next_domain))
- else:
- del(self.frontier[next_domain])
-
try:
self.visit_url(next_url[0], next_url[1])
except urllib2.URLError:
@@ -90,6 +72,8 @@ class Crawler(object):
def visit_url(self, url, found_via):
response = self.check_url(url, found_via)
+ self.frontier.notify_visit(url)
+
if response != None and not self.excluded(url):
self.collect_new_urls(url, response.read())
@@ -99,14 +83,7 @@ class Crawler(object):
try:
for page in self.extract_urls(html):
page = urlparse.urljoin(url, page)
- domain = urlparse.urlparse(page).netloc
-
- if not page in self.found:
- if not domain in self.frontier:
- self.frontier.setdefault(domain, [])
- heapq.heappush(self.crawltimes, (time.time(), domain))
- self.frontier[domain].append((page, url))
- self.found.add(page)
+ self.frontier.add(page, url)
except UnicodeEncodeError:
pass
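For readability, this is roughly what the simplified crawl() loop looks like after this change, pieced together from the hunks above (a sketch; the URLError handling that follows visit_url() lies outside the hunk and is elided here):

    def crawl(self):
        while len(self.frontier) > 0:
            time.sleep(self.wait_time)

            # The frontier decides which URL is due next and when.
            next_time, next_url = self.frontier.next()

            # next_time is the crawl time the frontier scheduled for that domain.
            while time.time() < next_time:
                time.sleep(0.5)

            try:
                self.visit_url(next_url[0], next_url[1])
            except urllib2.URLError:
                pass  # error handling elided (not shown in the hunk)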
diff --git a/frontier.py b/frontier.py
@@ -0,0 +1,63 @@
+import urlparse
+import time
+import heapq
+
+class Frontier(object):
+ def __init__(self):
+ # A dict of URLs that still have to be crawled,
+ # grouped by domain
+ self.urls = {}
+
+ # A heap of the next crawl times on domain level,
+ # to achieve an optimal throughput while maintaining a polite policy
+ self.crawltimes = []
+
+ # URLs we have already found and added to the frontier
+ self.found = set()
+
+ self._polite_time = 1
+
+ @property
+ def polite_time(self):
+ return self._polite_time
+
+ @polite_time.setter
+ def polite_time(self, seconds):
+ if seconds >= 0:
+ self._polite_time = seconds
+
+ def add(self, url, found_via):
+ if url in self.found:
+ return False
+
+ domain = urlparse.urlparse(url).netloc
+
+ # First URL we see for this domain: create its queue and schedule it
+ if not domain in self.urls:
+ self.urls[domain] = []
+ heapq.heappush(self.crawltimes, (time.time(), domain))
+
+ self.urls[domain].append((url, found_via))
+ self.found.add(url)
+
+ return True
+
+ def next(self):
+ next_time, next_domain = heapq.heappop(self.crawltimes)
+
+ next_url = self.urls[next_domain].pop()
+
+ if len(self.urls[next_domain]) == 0:
+ del(self.urls[next_domain])
+
+ return next_time, next_url
+
+ def notify_visit(self, url):
+ domain = urlparse.urlparse(url).netloc
+
+ # If there are still other URLs on this domain to crawl, schedule its next crawl time
+ if domain in self.urls:
+ heapq.heappush(self.crawltimes, (time.time() + self.polite_time, domain))
+
+ def __len__(self):
+ return sum([len(self.urls[domain]) for domain in self.urls])
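For illustration, a minimal usage sketch of the new Frontier class on its own. The example.com URLs and the driver loop are placeholders and not part of the commit:

    import time
    from frontier import Frontier

    frontier = Frontier()
    frontier.polite_time = 2  # wait 2 seconds between two hits to the same domain

    # Seed the frontier; found_via is None for the start URL.
    frontier.add("http://example.com/", None)
    frontier.add("http://example.com/about", "http://example.com/")
    frontier.add("http://example.com/about", "http://example.com/")  # duplicate, returns False

    while len(frontier) > 0:
        next_time, (url, found_via) = frontier.next()

        # Honour the crawl time the frontier scheduled for this domain.
        while time.time() < next_time:
            time.sleep(0.5)

        print "visiting %s (found via %s)" % (url, found_via)

        # Re-schedules the domain politely if it still has queued URLs.
        frontier.notify_visit(url)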