commit 6f83c00ddd3ab4a45b07da87ace0a3492dbd80c9
parent ec4b6107255053ec1c9e88cdf79bfdad4e5508cd
Author: Stefan <stefan@eliteinformatiker.de>
Date: Mon, 28 Jan 2013 13:48:31 +0100
added python properties and cli politeness
Diffstat:
 M crawler.py | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
1 file changed, 66 insertions(+), 19 deletions(-)
diff --git a/crawler.py b/crawler.py
@@ -1,18 +1,23 @@
 from BeautifulSoup import BeautifulSoup
-import urllib2, urlparse
+import urllib2, httplib, urlparse
 import re
 import time
-import httplib
+import heapq
 import argparse
 
-# TODO: Do not apply wait time to external links
-
 class Crawler(object):
     def __init__(self, init_url):
-        # A list of to be crawled urls, where the second element
-        # in the tuple is the URL via which we found this URL
-        self.pages = [(init_url, None)]
+        init_domain = urlparse.urlparse(init_url).netloc
+
+        # URLs that still have to be crawled, keyed by domain; the
+        # second element of each tuple is the URL via which we found it
+        self.frontier = {}
+        self.frontier[init_domain] = [(init_url, None)]
+
+        # A heap of the next crawl time per domain, to achieve an
+        # optimal throughput while maintaining a polite policy
+        self.crawltimes = [(time.time(), init_domain)]
 
         # Urls we have already visited
         self.found = set()
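
Note: the commit replaces the single self.pages list with a per-domain
frontier dict plus a heap of (next_crawl_time, domain) pairs. A minimal
standalone sketch of that layout, using two hypothetical URLs (not part
of the commit itself):

    import heapq, time, urlparse

    frontier = {}    # domain -> [(url, url_we_found_it_on), ...]
    crawltimes = []  # heap of (next_crawl_time, domain)

    for url in ["http://example.com/a", "http://example.org/b"]:
        domain = urlparse.urlparse(url).netloc
        if domain not in frontier:
            frontier[domain] = []
            heapq.heappush(crawltimes, (time.time(), domain))
        frontier[domain].append((url, None))

    # The heap always yields the domain that may be crawled soonest.
    print heapq.heappop(crawltimes)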
@@ -23,22 +28,57 @@ class Crawler(object):
         # Regular expression for URLs we are interested in (our internal
         # URLs)
-        self.url_match = None
+        self._url_match = None
 
         # Timeout in seconds to wait, so that we do not kill our server
-        self.wait_time = 0
+        self._wait_time = 0
+
+        # Minimum time between two requests to the same domain;
+        # this determines how polite the crawler is
+        self._polite_time = 1
+
+    @property
+    def restrict(self):
+        return self._url_match
 
-    def set_url_restrict(self, regexp):
-        self.url_match = re.compile(regexp)
+    @restrict.setter
+    def restrict(self, url_match):
+        self._url_match = re.compile(url_match)
 
-    def set_wait_time(self, seconds):
-        self.wait_time = seconds
+    @property
+    def wait_time(self):
+        return self._wait_time
+
+    @wait_time.setter
+    def wait_time(self, seconds):
+        if seconds >= 0:
+            self._wait_time = seconds
+
+    @property
+    def polite_time(self):
+        return self._polite_time
+
+    @polite_time.setter
+    def polite_time(self, seconds):
+        if seconds >= 0:
+            self._polite_time = seconds
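
Note: with the properties in place, the old setter methods become plain
attribute assignments; the setters silently ignore negative values. A short
illustration (the start URL and pattern are made up):

    c = Crawler("http://example.com")
    c.restrict = "^http://example\.com"  # compiled by the setter via re.compile
    c.wait_time = 0.5                    # accepted: non-negative
    c.polite_time = -1                   # ignored: stays at the default of 1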
     def crawl(self):
-        while len(self.pages) > 0:
+        while len(self.frontier) > 0:
             time.sleep(self.wait_time)
 
-            next_url = self.pages.pop()
+            # Pick the domain whose polite delay expires first
+            next_time, next_domain = heapq.heappop(self.crawltimes)
+            next_url = self.frontier[next_domain].pop()
+
+            while time.time() < next_time:
+                time.sleep(0.5)
+
+            # Re-queue the domain only while it has pending URLs
+            if len(self.frontier[next_domain]) > 0:
+                next_crawl = time.time() + self.polite_time
+                heapq.heappush(self.crawltimes,
+                               (next_crawl, next_domain))
+            else:
+                del self.frontier[next_domain]
 
             try:
                 self.visit_url(next_url[0], next_url[1])
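
Note: heapq keeps the tuple with the smallest first element at the front,
so crawl() always pops the domain whose polite delay expires first. A quick
illustration with made-up timestamps:

    import heapq

    crawltimes = []
    heapq.heappush(crawltimes, (1500.0, "example.org"))
    heapq.heappush(crawltimes, (1200.0, "example.com"))
    print heapq.heappop(crawltimes)  # (1200.0, 'example.com'): earliest first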
@@ -59,9 +99,13 @@ class Crawler(object):
         try:
             for page in self.extract_urls(html):
                 page = urlparse.urljoin(url, page)
+                domain = urlparse.urlparse(page).netloc
 
                 if not page in self.found:
-                    self.pages.append((page, url))
+                    # New domains get a frontier entry and an
+                    # immediate slot on the crawl-time heap
+                    if domain not in self.frontier:
+                        self.frontier.setdefault(domain, [])
+                        heapq.heappush(self.crawltimes, (time.time(), domain))
+
+                    self.frontier[domain].append((page, url))
                     self.found.add(page)
         except UnicodeEncodeError:
             pass
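
Note: dict.setdefault already covers the missing-key case, so the guard
above could be collapsed into one call. An illustrative standalone
equivalent of the insertion step (not part of the commit; assumes, as
crawl() guarantees, that the frontier never holds empty lists):

    import heapq, time, urlparse

    def enqueue(frontier, crawltimes, found, page, via):
        domain = urlparse.urlparse(page).netloc
        if page not in found:
            bucket = frontier.setdefault(domain, [])
            if not bucket:  # first pending URL for this domain
                heapq.heappush(crawltimes, (time.time(), domain))
            bucket.append((page, via))
            found.add(page)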
@@ -95,7 +139,7 @@ class Crawler(object):
         return [link.get('href') for link in soup.findAll('a')]
 
     def excluded(self, url):
-        return self.url_match != None and not self.url_match.search(url)
+        return self._url_match is not None and not self._url_match.search(url)
 
     def print_deadlinks(self, deadlinks):
         if len(deadlinks) == 0:
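
Note: excluded() skips a URL only when a restriction is set and the URL
does not match it; without a restriction everything is crawled. For
example, with the hypothetical pattern from above:

    import re

    url_match = re.compile("^http://example\.com")
    print not url_match.search("http://example.com/about")  # False: crawled
    print not url_match.search("http://other.org/")         # True: skipped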
@@ -114,12 +158,15 @@ if __name__ == "__main__":
parser.add_argument('url', metavar='URL', type=str, help="The starting point for your crawl")
parser.add_argument('--restrict', dest='restrict', help="Restrict the crawl to specific URLs via a regular expression (usually your own domain")
parser.add_argument('--wait', dest='wait_time', type=float, help="Set some waiting time between each URL fetch")
+ parser.add_argument('--politeness', dest='polite_time', type=float, help="Set the time to wait between calling two URLs of the same domain")
args = parser.parse_args()
c = Crawler(args.url)
if args.restrict:
- c.set_url_restrict(args.restrict)
+ c.restrict = args.restrict
if args.wait_time:
- c.set_wait_time(args.wait_time)
+ c.wait_time = args.wait_time
+ if args.polite_time:
+ c.polite_time = args.polite_time
c.crawl()
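
Note: a hypothetical invocation using the new flag, waiting 0.1 seconds
between any two fetches and at least 2 seconds between requests to the
same domain:

    python crawler.py http://example.com --restrict "^http://example\.com" --wait 0.1 --politeness 2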