def __init__(self, log,
             roots, exclude=None, strict=True,  # What to crawl.
             max_redirect=10, max_tries=4,  # Per-url limits.
             max_tasks=10, max_pool=10,  # Global limits.
             ):
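    """Set up crawler configuration and bookkeeping state.

    roots are the URLs the crawl starts from; exclude optionally filters
    URLs out of the crawl; strict controls how hosts are matched against
    the root domains (see below).  max_redirect and max_tries bound the
    work done per URL; max_tasks and max_pool cap concurrent tasks and
    pooled connections globally.
    """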
    self.log = log
    self.roots = roots
    self.exclude = exclude
    self.strict = strict
    self.max_redirect = max_redirect
    self.max_tries = max_tries
    self.max_tasks = max_tasks
    self.max_pool = max_pool
    self.todo = {}  # URLs yet to be fetched.
    self.busy = {}  # URLs currently being fetched.
    self.done = {}  # URLs already fetched (successfully or not).
    self.pool = ConnectionPool(self.log, max_pool, max_tasks)
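    # Derive the set of root domains; these define which hosts are
    # considered part of the crawl.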
    self.root_domains = set()
    for root in roots:
        parts = urllib.parse.urlparse(root)
        # parts.hostname is already lower-cased with the port stripped;
        # it replaces the deprecated urllib.parse.splitport().
        host = parts.hostname
        if not host:
            continue
        if re.match(r'\A[\d\.]*\Z', host):
            # A bare IP address is kept as-is.
            self.root_domains.add(host)
        elif self.strict:
            # Strict mode: match the exact host, with and without 'www.'.
            self.root_domains.add(host)
            if host.startswith('www.'):
                self.root_domains.add(host[4:])
            else:
                self.root_domains.add('www.' + host)
        else:
            # Lenient mode: match anything under the second-level domain.
            labels = host.split('.')
            if len(labels) > 2:
                host = '.'.join(labels[-2:])
            self.root_domains.add(host)
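    # For example, with strict=True the root 'http://python.org' yields
    # {'python.org', 'www.python.org'}, while with strict=False a root
    # like 'http://docs.python.org' is collapsed to just 'python.org'.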
    for root in roots:
        self.add_url(root)
    self.governor = asyncio.Semaphore(max_tasks)  # Caps concurrent fetch tasks.
    self.termination = asyncio.Condition()  # Used to wait for the crawl to finish.
    self.t0 = time.time()  # Wall-clock start of the crawl.
    self.t1 = None  # Set to the end time when the crawl finishes.
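
    # Minimal usage sketch (hypothetical: assumes the class exposes a
    # crawl() coroutine that drives the fetch loop, which this section
    # does not show):
    #
    #   crawler = Crawler(log, roots=['http://example.com/'],
    #                     strict=False, max_tasks=100)
    #   asyncio.run(crawler.crawl())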