You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

675 lines
25 KiB

import os
import sys
import urllib.request, urllib.error, urllib.parse
import copy
import threading
import time
import math
import tempfile
import base64
import hashlib
import socket
import logging
from io import StringIO
import multiprocessing.dummy as multiprocessing
from ctypes import c_int
import json
import ssl
from . import utils
from .control_thread import ControlThread
from .download import download
__all__ = ['SmartDL', 'utils']
__version_mjaor__ = 1
__version_minor__ = 3
__version_micro__ = 4
__version__ = "{}.{}.{}".format(__version_mjaor__, __version_minor__, __version_micro__)
class HashFailedException(Exception):
"Raised when hash check fails."
def __init__(self, fn, calc_hash, needed_hash):
self.filename = fn
self.calculated_hash = calc_hash
self.needed_hash = needed_hash
def __str__(self):
return 'HashFailedException({}, got {}, expected {})'.format(self.filename, self.calculated_hash, self.needed_hash)
def __repr__(self):
return '<HashFailedException {}, got {}, expected {}>'.format(self.filename, self.calculated_hash, self.needed_hash)
class CanceledException(Exception):
"Raised when the job is canceled."
def __init__(self):
pass
def __str__(self):
return 'CanceledException'
def __repr__(self):
return "<CanceledException>"
class SmartDL:
'''
The main SmartDL class
:param urls: Download url. It is possible to pass unsafe and unicode characters. You can also pass a list of urls, and those will be used as mirrors.
:type urls: string or list of strings
:param dest: Destination path. Default is `%TEMP%/pySmartDL/`.
:type dest: string
:param progress_bar: If True, prints a progress bar to the `stdout stream <http://docs.python.org/2/library/sys.html#sys.stdout>`_. Default is `True`.
:type progress_bar: bool
:param fix_urls: If true, attempts to fix urls with unsafe characters.
:type fix_urls: bool
:param threads: Number of threads to use.
:type threads: int
:param timeout: Timeout for network operations, in seconds. Default is 5.
:type timeout: int
:param logger: An optional logger.
:type logger: `logging.Logger` instance
:param connect_default_logger: If true, connects a default logger to the class.
:type connect_default_logger: bool
:param request_args: Arguments to be passed to a new urllib.request.Request instance in dictionary form. See `urllib.request docs <https://docs.python.org/3/library/urllib.request.html#urllib.request.Request>`_ for options.
:type request_args: dict
:rtype: `SmartDL` instance
:param verify: If ssl certificates should be validated.
:type verify: bool
.. NOTE::
The provided dest may be a folder or a full path name (including filename). The workflow is:
* If the path exists, and it's an existing folder, the file will be downloaded to there with the original filename.
* If the past does not exist, it will create the folders, if needed, and refer to the last section of the path as the filename.
* If you want to download to folder that does not exist at the moment, and want the module to fill in the filename, make sure the path ends with `os.sep`.
* If no path is provided, `%TEMP%/pySmartDL/` will be used.
'''
def __init__(self, urls, dest=None, progress_bar=True, fix_urls=True, threads=5, timeout=5, logger=None, connect_default_logger=False, request_args=None, verify=True):
if logger:
self.logger = logger
elif connect_default_logger:
self.logger = utils.create_debugging_logger()
else:
self.logger = utils.DummyLogger()
if request_args:
if "headers" not in request_args:
request_args["headers"] = dict()
self.requestArgs = request_args
else:
self.requestArgs = {"headers": dict()}
if "User-Agent" not in self.requestArgs["headers"]:
self.requestArgs["headers"]["User-Agent"] = utils.get_random_useragent()
self.mirrors = [urls] if isinstance(urls, str) else urls
if fix_urls:
self.mirrors = [utils.url_fix(x) for x in self.mirrors]
self.url = self.mirrors.pop(0)
self.logger.info('Using url "{}"'.format(self.url))
fn = urllib.parse.unquote(os.path.basename(urllib.parse.urlparse(self.url).path))
self.dest = dest or os.path.join(tempfile.gettempdir(), 'pySmartDL', fn)
if self.dest[-1] == os.sep:
if os.path.exists(self.dest[:-1]) and os.path.isfile(self.dest[:-1]):
os.unlink(self.dest[:-1])
self.dest += fn
if os.path.isdir(self.dest):
self.dest = os.path.join(self.dest, fn)
self.progress_bar = progress_bar
self.threads_count = threads
self.timeout = timeout
self.current_attemp = 1
self.attemps_limit = 4
self.minChunkFile = 1024**2*2 # 2MB
self.filesize = 0
self.shared_var = multiprocessing.Value(c_int, 0) # a ctypes var that counts the bytes already downloaded
self.thread_shared_cmds = {}
self.status = "ready"
self.verify_hash = False
self._killed = False
self._failed = False
self._start_func_blocking = True
self.errors = []
self.post_threadpool_thread = None
self.control_thread = None
if not os.path.exists(os.path.dirname(self.dest)):
self.logger.info('Folder "{}" does not exist. Creating...'.format(os.path.dirname(self.dest)))
os.makedirs(os.path.dirname(self.dest))
if not utils.is_HTTPRange_supported(self.url, timeout=self.timeout):
self.logger.warning("Server does not support HTTPRange. threads_count is set to 1.")
self.threads_count = 1
if os.path.exists(self.dest):
self.logger.warning('Destination "{}" already exists. Existing file will be removed.'.format(self.dest))
if not os.path.exists(os.path.dirname(self.dest)):
self.logger.warning('Directory "{}" does not exist. Creating it...'.format(os.path.dirname(self.dest)))
os.makedirs(os.path.dirname(self.dest))
self.logger.info("Creating a ThreadPool of {} thread(s).".format(self.threads_count))
self.pool = utils.ManagedThreadPoolExecutor(self.threads_count)
if verify:
self.context = None
else:
self.context = ssl.create_default_context()
self.context.check_hostname = False
self.context.verify_mode = ssl.CERT_NONE
def __str__(self):
return 'SmartDL(r"{}", dest=r"{}")'.format(self.url, self.dest)
def __repr__(self):
return "<SmartDL {}>".format(self.url)
def add_basic_authentication(self, username, password):
'''
Uses HTTP Basic Access authentication for the connection.
:param username: Username.
:type username: string
:param password: Password.
:type password: string
'''
auth_string = '{}:{}'.format(username, password)
base64string = base64.standard_b64encode(auth_string.encode('utf-8'))
self.requestArgs['headers']['Authorization'] = b"Basic " + base64string
def add_hash_verification(self, algorithm, hash):
'''
Adds hash verification to the download.
If hash is not correct, will try different mirrors. If all mirrors aren't
passing hash verification, `HashFailedException` Exception will be raised.
.. NOTE::
If downloaded file already exist on the destination, and hash matches, pySmartDL will not download it again.
.. WARNING::
The hashing algorithm must be supported on your system, as documented at `hashlib documentation page <http://docs.python.org/3/library/hashlib.html>`_.
:param algorithm: Hashing algorithm.
:type algorithm: string
:param hash: Hash code.
:type hash: string
'''
self.verify_hash = True
self.hash_algorithm = algorithm
self.hash_code = hash
def fetch_hash_sums(self):
'''
Will attempt to fetch UNIX hash sums files (`SHA256SUMS`, `SHA1SUMS` or `MD5SUMS` files in
the same url directory).
Calls `self.add_hash_verification` if successful. Returns if a matching hash was found.
:rtype: bool
*New in 1.2.1*
'''
default_sums_filenames = ['SHA256SUMS', 'SHA1SUMS', 'MD5SUMS']
folder = os.path.dirname(self.url)
orig_basename = os.path.basename(self.url)
self.logger.info("Looking for SUMS files...")
for filename in default_sums_filenames:
try:
sums_url = "%s/%s" % (folder, filename)
sumsRequest = urllib.request.Request(sums_url, **self.requestArgs)
obj = urllib.request.urlopen(sumsRequest)
data = obj.read().split('\n')
obj.close()
for line in data:
if orig_basename.lower() in line.lower():
self.logger.info("Found a matching hash in %s" % sums_url)
algo = filename.rstrip('SUMS')
hash = line.split(' ')[0]
self.add_hash_verification(algo, hash)
return
except urllib.error.HTTPError:
continue
def start(self, blocking=None):
'''
Starts the download task. Will raise `RuntimeError` if it's the object's already downloading.
.. warning::
If you're using the non-blocking mode, Exceptions won't be raised. In that case, call
`isSuccessful()` after the task is finished, to make sure the download succeeded. Call
`get_errors()` to get the the exceptions.
:param blocking: If true, calling this function will block the thread until the download finished. Default is *True*.
:type blocking: bool
'''
if not self.status == "ready":
raise RuntimeError("cannot start (current status is {})".format(self.status))
self.logger.info('Starting a new SmartDL operation.')
if blocking is None:
blocking = self._start_func_blocking
else:
self._start_func_blocking = blocking
if self.mirrors:
self.logger.info('One URL and {} mirrors are loaded.'.format(len(self.mirrors)))
else:
self.logger.info('One URL is loaded.')
if self.verify_hash and os.path.exists(self.dest):
if utils.get_file_hash(self.hash_algorithm, self.dest) == self.hash_code:
self.logger.info("Destination '%s' already exists, and the hash matches. No need to download." % self.dest)
self.status = 'finished'
return
self.logger.info("Downloading '{}' to '{}'...".format(self.url, self.dest))
req = urllib.request.Request(self.url, **self.requestArgs)
try:
urlObj = urllib.request.urlopen(req, timeout=self.timeout, context=self.context)
except (urllib.error.HTTPError, urllib.error.URLError, socket.timeout) as e:
self.errors.append(e)
if self.mirrors:
self.logger.info("{} Trying next mirror...".format(str(e)))
self.url = self.mirrors.pop(0)
self.logger.info('Using url "{}"'.format(self.url))
self.start(blocking)
return
else:
self.logger.warning(str(e))
self.errors.append(e)
self._failed = True
self.status = "finished"
raise
try:
self.filesize = int(urlObj.headers["Content-Length"])
self.logger.info("Content-Length is {} ({}).".format(self.filesize, utils.sizeof_human(self.filesize)))
except (IndexError, KeyError, TypeError):
self.logger.warning("Server did not send Content-Length. Filesize is unknown.")
self.filesize = 0
args = utils.calc_chunk_size(self.filesize, self.threads_count, self.minChunkFile)
bytes_per_thread = args[0][1] - args[0][0] + 1
if len(args)>1:
self.logger.info("Launching {} threads (downloads {}/thread).".format(len(args), utils.sizeof_human(bytes_per_thread)))
else:
self.logger.info("Launching 1 thread (downloads {}).".format(utils.sizeof_human(bytes_per_thread)))
self.status = "downloading"
for i, arg in enumerate(args):
req = self.pool.submit(
download,
self.url,
self.dest+".%.3d" % i,
self.requestArgs,
self.context,
arg[0],
arg[1],
self.timeout,
self.shared_var,
self.thread_shared_cmds,
self.logger
)
self.post_threadpool_thread = threading.Thread(
target=post_threadpool_actions,
args=(
self.pool,
[[(self.dest+".%.3d" % i) for i in range(len(args))], self.dest],
self.filesize,
self
)
)
self.post_threadpool_thread.daemon = True
self.post_threadpool_thread.start()
self.control_thread = ControlThread(self)
if blocking:
self.wait(raise_exceptions=True)
def _exc_callback(self, req, e):
self.errors.append(e[0])
self.logger.exception(e[1])
def retry(self, eStr=""):
if self.current_attemp < self.attemps_limit:
self.current_attemp += 1
self.status = "ready"
self.shared_var.value = 0
self.thread_shared_cmds = {}
self.start()
else:
s = 'The maximum retry attempts reached'
if eStr:
s += " ({})".format(eStr)
self.errors.append(urllib.error.HTTPError(self.url, "0", s, {}, StringIO()))
self._failed = True
def try_next_mirror(self, e=None):
if self.mirrors:
if e:
self.errors.append(e)
self.status = "ready"
self.shared_var.value = 0
self.url = self.mirrors.pop(0)
self.logger.info('Using url "{}"'.format(self.url))
self.start()
else:
self._failed = True
self.errors.append(e)
def get_eta(self, human=False):
'''
Get estimated time of download completion, in seconds. Returns `0` if there is
no enough data to calculate the estimated time (this will happen on the approx.
first 5 seconds of each download).
:param human: If true, returns a human-readable formatted string. Else, returns an int type number
:type human: bool
:rtype: int/string
'''
if human:
s = utils.time_human(self.control_thread.get_eta())
return s if s else "TBD"
return self.control_thread.get_eta()
def get_speed(self, human=False):
'''
Get current transfer speed in bytes per second.
:param human: If true, returns a human-readable formatted string. Else, returns an int type number
:type human: bool
:rtype: int/string
'''
if human:
return "{}/s".format(utils.sizeof_human(self.control_thread.get_speed()))
return self.control_thread.get_speed()
def get_progress(self):
'''
Returns the current progress of the download, as a float between `0` and `1`.
:rtype: float
'''
if not self.filesize:
return 0
if self.control_thread.get_dl_size() <= self.filesize:
return 1.0*self.control_thread.get_dl_size()/self.filesize
return 1.0
def get_progress_bar(self, length=20):
'''
Returns the current progress of the download as a string containing a progress bar.
.. NOTE::
That's an alias for pySmartDL.utils.progress_bar(obj.get_progress()).
:param length: The length of the progress bar in chars. Default is 20.
:type length: int
:rtype: string
'''
return utils.progress_bar(self.get_progress(), length)
def isFinished(self):
'''
Returns if the task is finished.
:rtype: bool
'''
if self.status == "ready":
return False
if self.status == "finished":
return True
return not self.post_threadpool_thread.is_alive()
def isSuccessful(self):
'''
Returns if the download is successfull. It may fail in the following scenarios:
- Hash check is enabled and fails.
- All mirrors are down.
- Any local I/O problems (such as `no disk space available`).
.. NOTE::
Call `get_errors()` to get the exceptions, if any.
Will raise `RuntimeError` if it's called when the download task is not finished yet.
:rtype: bool
'''
if self._killed:
return False
n = 0
while self.status != 'finished':
n += 1
time.sleep(0.1)
if n >= 15:
raise RuntimeError("The download task must be finished in order to see if it's successful. (current status is {})".format(self.status))
return not self._failed
def get_errors(self):
'''
Get errors happened while downloading.
:rtype: list of `Exception` instances
'''
return self.errors
def get_status(self):
'''
Returns the current status of the task. Possible values: *ready*,
*downloading*, *paused*, *combining*, *finished*.
:rtype: string
'''
return self.status
def wait(self, raise_exceptions=False):
'''
Blocks until the download is finished.
:param raise_exceptions: If true, this function will raise exceptions. Default is *False*.
:type raise_exceptions: bool
'''
if self.status in ["ready", "finished"]:
return
while not self.isFinished():
time.sleep(0.1)
self.post_threadpool_thread.join()
self.control_thread.join()
if self._failed and raise_exceptions:
raise self.errors[-1]
def stop(self):
'''
Stops the download.
'''
if self.status == "downloading":
self.thread_shared_cmds['stop'] = ""
self._killed = True
def pause(self):
'''
Pauses the download.
'''
if self.status == "downloading":
self.status = "paused"
self.thread_shared_cmds['pause'] = ""
def resume(self):
'''
Continues the download. same as unpause().
'''
self.unpause()
def unpause(self):
'''
Continues the download. same as resume().
'''
if self.status == "paused" and 'pause' in self.thread_shared_cmds:
self.status = "downloading"
del self.thread_shared_cmds['pause']
def limit_speed(self, speed):
'''
Limits the download transfer speed.
:param speed: Speed in bytes per download per second. Negative values will not limit the speed. Default is `-1`.
:type speed: int
'''
if self.status == "downloading":
if speed == 0:
self.pause()
else:
self.unpause()
if speed > 0:
self.thread_shared_cmds['limit'] = speed/self.threads_count
elif 'limit' in self.thread_shared_cmds:
del self.thread_shared_cmds['limit']
def get_dest(self):
'''
Get the destination path of the downloaded file. Needed when no
destination is provided to the class, and exists on a temp folder.
:rtype: string
'''
return self.dest
def get_dl_time(self, human=False):
'''
Returns how much time did the download take, in seconds. Returns
`-1` if the download task is not finished yet.
:param human: If true, returns a human-readable formatted string. Else, returns an int type number
:type human: bool
:rtype: int/string
'''
if not self.control_thread:
return 0
if human:
return utils.time_human(self.control_thread.get_dl_time())
return self.control_thread.get_dl_time()
def get_dl_size(self, human=False):
'''
Get downloaded bytes counter in bytes.
:param human: If true, returns a human-readable formatted string. Else, returns an int type number
:type human: bool
:rtype: int/string
'''
if not self.control_thread:
return 0
if human:
return utils.sizeof_human(self.control_thread.get_dl_size())
return self.control_thread.get_dl_size()
def get_final_filesize(self, human=False):
'''
Get total download size in bytes.
:param human: If true, returns a human-readable formatted string. Else, returns an int type number
:type human: bool
:rtype: int/string
'''
if not self.control_thread:
return 0
if human:
return utils.sizeof_human(self.control_thread.get_final_filesize())
return self.control_thread.get_final_filesize()
def get_data(self, binary=False, bytes=-1):
'''
Returns the downloaded data. Will raise `RuntimeError` if it's
called when the download task is not finished yet.
:param binary: If true, will read the data as binary. Else, will read it as text.
:type binary: bool
:param bytes: Number of bytes to read. Negative values will read until EOF. Default is `-1`.
:type bytes: int
:rtype: string
'''
if self.status != 'finished':
raise RuntimeError("The download task must be finished in order to read the data. (current status is %s)" % self.status)
flags = 'rb' if binary else 'r'
with open(self.get_dest(), flags) as f:
data = f.read(bytes) if bytes>0 else f.read()
return data
def get_data_hash(self, algorithm):
'''
Returns the downloaded data's hash. Will raise `RuntimeError` if it's
called when the download task is not finished yet.
:param algorithm: Hashing algorithm.
:type algorithm: bool
:rtype: string
.. WARNING::
The hashing algorithm must be supported on your system, as documented at `hashlib documentation page <http://docs.python.org/3/library/hashlib.html>`_.
'''
return hashlib.new(algorithm, self.get_data(binary=True)).hexdigest()
def get_json(self):
'''
Returns the JSON in the downloaded data. Will raise `RuntimeError` if it's
called when the download task is not finished yet. Will raise `json.decoder.JSONDecodeError`
if the downloaded data is not valid JSON.
:rtype: dict
'''
data = self.get_data()
return json.loads(data)
def post_threadpool_actions(pool, args, expected_filesize, SmartDLObj):
"Run function after thread pool is done. Run this in a thread."
while not pool.done():
time.sleep(0.1)
if SmartDLObj._killed:
return
if pool.get_exception():
for exc in pool.get_exceptions():
SmartDLObj.logger.exception(exc)
SmartDLObj.retry(str(pool.get_exception()))
if SmartDLObj._failed:
SmartDLObj.logger.warning("Task had errors. Exiting...")
return
if expected_filesize: # if not zero, expected filesize is known
threads = len(args[0])
total_filesize = sum([os.path.getsize(x) for x in args[0]])
diff = math.fabs(expected_filesize - total_filesize)
# if the difference is more than 4*thread numbers (because a thread may download 4KB extra per thread because of NTFS's block size)
if diff > 4*1024*threads:
errMsg = 'Diff between downloaded files and expected filesizes is {}B (filesize: {}, expected_filesize: {}, {} threads).'.format(total_filesize, expected_filesize, diff, threads)
SmartDLObj.logger.warning(errMsg)
SmartDLObj.retry(errMsg)
return
SmartDLObj.status = "combining"
utils.combine_files(*args)
if SmartDLObj.verify_hash:
dest_path = args[-1]
hash_ = utils.get_file_hash(SmartDLObj.hash_algorithm, dest_path)
if hash_ == SmartDLObj.hash_code:
SmartDLObj.logger.info('Hash verification succeeded.')
else:
SmartDLObj.logger.warning('Hash verification failed.')
SmartDLObj.try_next_mirror(HashFailedException(os.path.basename(dest_path), hash, SmartDLObj.hash_code))