import os import sys import urllib.request, urllib.error, urllib.parse import copy import threading import time import math import tempfile import base64 import hashlib import socket import logging from io import StringIO import multiprocessing.dummy as multiprocessing from ctypes import c_int import json import ssl from . import utils from .control_thread import ControlThread from .download import download __all__ = ['SmartDL', 'utils'] __version_mjaor__ = 1 __version_minor__ = 3 __version_micro__ = 4 __version__ = "{}.{}.{}".format(__version_mjaor__, __version_minor__, __version_micro__) class HashFailedException(Exception): "Raised when hash check fails." def __init__(self, fn, calc_hash, needed_hash): self.filename = fn self.calculated_hash = calc_hash self.needed_hash = needed_hash def __str__(self): return 'HashFailedException({}, got {}, expected {})'.format(self.filename, self.calculated_hash, self.needed_hash) def __repr__(self): return ''.format(self.filename, self.calculated_hash, self.needed_hash) class CanceledException(Exception): "Raised when the job is canceled." def __init__(self): pass def __str__(self): return 'CanceledException' def __repr__(self): return "" class SmartDL: ''' The main SmartDL class :param urls: Download url. It is possible to pass unsafe and unicode characters. You can also pass a list of urls, and those will be used as mirrors. :type urls: string or list of strings :param dest: Destination path. Default is `%TEMP%/pySmartDL/`. :type dest: string :param progress_bar: If True, prints a progress bar to the `stdout stream `_. Default is `True`. :type progress_bar: bool :param fix_urls: If true, attempts to fix urls with unsafe characters. :type fix_urls: bool :param threads: Number of threads to use. :type threads: int :param timeout: Timeout for network operations, in seconds. Default is 5. :type timeout: int :param logger: An optional logger. :type logger: `logging.Logger` instance :param connect_default_logger: If true, connects a default logger to the class. :type connect_default_logger: bool :param request_args: Arguments to be passed to a new urllib.request.Request instance in dictionary form. See `urllib.request docs `_ for options. :type request_args: dict :rtype: `SmartDL` instance :param verify: If ssl certificates should be validated. :type verify: bool .. NOTE:: The provided dest may be a folder or a full path name (including filename). The workflow is: * If the path exists, and it's an existing folder, the file will be downloaded to there with the original filename. * If the past does not exist, it will create the folders, if needed, and refer to the last section of the path as the filename. * If you want to download to folder that does not exist at the moment, and want the module to fill in the filename, make sure the path ends with `os.sep`. * If no path is provided, `%TEMP%/pySmartDL/` will be used. ''' def __init__(self, urls, dest=None, progress_bar=True, fix_urls=True, threads=5, timeout=5, logger=None, connect_default_logger=False, request_args=None, verify=True): if logger: self.logger = logger elif connect_default_logger: self.logger = utils.create_debugging_logger() else: self.logger = utils.DummyLogger() if request_args: if "headers" not in request_args: request_args["headers"] = dict() self.requestArgs = request_args else: self.requestArgs = {"headers": dict()} if "User-Agent" not in self.requestArgs["headers"]: self.requestArgs["headers"]["User-Agent"] = utils.get_random_useragent() self.mirrors = [urls] if isinstance(urls, str) else urls if fix_urls: self.mirrors = [utils.url_fix(x) for x in self.mirrors] self.url = self.mirrors.pop(0) self.logger.info('Using url "{}"'.format(self.url)) fn = urllib.parse.unquote(os.path.basename(urllib.parse.urlparse(self.url).path)) self.dest = dest or os.path.join(tempfile.gettempdir(), 'pySmartDL', fn) if self.dest[-1] == os.sep: if os.path.exists(self.dest[:-1]) and os.path.isfile(self.dest[:-1]): os.unlink(self.dest[:-1]) self.dest += fn if os.path.isdir(self.dest): self.dest = os.path.join(self.dest, fn) self.progress_bar = progress_bar self.threads_count = threads self.timeout = timeout self.current_attemp = 1 self.attemps_limit = 4 self.minChunkFile = 1024**2*2 # 2MB self.filesize = 0 self.shared_var = multiprocessing.Value(c_int, 0) # a ctypes var that counts the bytes already downloaded self.thread_shared_cmds = {} self.status = "ready" self.verify_hash = False self._killed = False self._failed = False self._start_func_blocking = True self.errors = [] self.post_threadpool_thread = None self.control_thread = None if not os.path.exists(os.path.dirname(self.dest)): self.logger.info('Folder "{}" does not exist. Creating...'.format(os.path.dirname(self.dest))) os.makedirs(os.path.dirname(self.dest)) if not utils.is_HTTPRange_supported(self.url, timeout=self.timeout): self.logger.warning("Server does not support HTTPRange. threads_count is set to 1.") self.threads_count = 1 if os.path.exists(self.dest): self.logger.warning('Destination "{}" already exists. Existing file will be removed.'.format(self.dest)) if not os.path.exists(os.path.dirname(self.dest)): self.logger.warning('Directory "{}" does not exist. Creating it...'.format(os.path.dirname(self.dest))) os.makedirs(os.path.dirname(self.dest)) self.logger.info("Creating a ThreadPool of {} thread(s).".format(self.threads_count)) self.pool = utils.ManagedThreadPoolExecutor(self.threads_count) if verify: self.context = None else: self.context = ssl.create_default_context() self.context.check_hostname = False self.context.verify_mode = ssl.CERT_NONE def __str__(self): return 'SmartDL(r"{}", dest=r"{}")'.format(self.url, self.dest) def __repr__(self): return "".format(self.url) def add_basic_authentication(self, username, password): ''' Uses HTTP Basic Access authentication for the connection. :param username: Username. :type username: string :param password: Password. :type password: string ''' auth_string = '{}:{}'.format(username, password) base64string = base64.standard_b64encode(auth_string.encode('utf-8')) self.requestArgs['headers']['Authorization'] = b"Basic " + base64string def add_hash_verification(self, algorithm, hash): ''' Adds hash verification to the download. If hash is not correct, will try different mirrors. If all mirrors aren't passing hash verification, `HashFailedException` Exception will be raised. .. NOTE:: If downloaded file already exist on the destination, and hash matches, pySmartDL will not download it again. .. WARNING:: The hashing algorithm must be supported on your system, as documented at `hashlib documentation page `_. :param algorithm: Hashing algorithm. :type algorithm: string :param hash: Hash code. :type hash: string ''' self.verify_hash = True self.hash_algorithm = algorithm self.hash_code = hash def fetch_hash_sums(self): ''' Will attempt to fetch UNIX hash sums files (`SHA256SUMS`, `SHA1SUMS` or `MD5SUMS` files in the same url directory). Calls `self.add_hash_verification` if successful. Returns if a matching hash was found. :rtype: bool *New in 1.2.1* ''' default_sums_filenames = ['SHA256SUMS', 'SHA1SUMS', 'MD5SUMS'] folder = os.path.dirname(self.url) orig_basename = os.path.basename(self.url) self.logger.info("Looking for SUMS files...") for filename in default_sums_filenames: try: sums_url = "%s/%s" % (folder, filename) sumsRequest = urllib.request.Request(sums_url, **self.requestArgs) obj = urllib.request.urlopen(sumsRequest) data = obj.read().split('\n') obj.close() for line in data: if orig_basename.lower() in line.lower(): self.logger.info("Found a matching hash in %s" % sums_url) algo = filename.rstrip('SUMS') hash = line.split(' ')[0] self.add_hash_verification(algo, hash) return except urllib.error.HTTPError: continue def start(self, blocking=None): ''' Starts the download task. Will raise `RuntimeError` if it's the object's already downloading. .. warning:: If you're using the non-blocking mode, Exceptions won't be raised. In that case, call `isSuccessful()` after the task is finished, to make sure the download succeeded. Call `get_errors()` to get the the exceptions. :param blocking: If true, calling this function will block the thread until the download finished. Default is *True*. :type blocking: bool ''' if not self.status == "ready": raise RuntimeError("cannot start (current status is {})".format(self.status)) self.logger.info('Starting a new SmartDL operation.') if blocking is None: blocking = self._start_func_blocking else: self._start_func_blocking = blocking if self.mirrors: self.logger.info('One URL and {} mirrors are loaded.'.format(len(self.mirrors))) else: self.logger.info('One URL is loaded.') if self.verify_hash and os.path.exists(self.dest): if utils.get_file_hash(self.hash_algorithm, self.dest) == self.hash_code: self.logger.info("Destination '%s' already exists, and the hash matches. No need to download." % self.dest) self.status = 'finished' return self.logger.info("Downloading '{}' to '{}'...".format(self.url, self.dest)) req = urllib.request.Request(self.url, **self.requestArgs) try: urlObj = urllib.request.urlopen(req, timeout=self.timeout, context=self.context) except (urllib.error.HTTPError, urllib.error.URLError, socket.timeout) as e: self.errors.append(e) if self.mirrors: self.logger.info("{} Trying next mirror...".format(str(e))) self.url = self.mirrors.pop(0) self.logger.info('Using url "{}"'.format(self.url)) self.start(blocking) return else: self.logger.warning(str(e)) self.errors.append(e) self._failed = True self.status = "finished" raise try: self.filesize = int(urlObj.headers["Content-Length"]) self.logger.info("Content-Length is {} ({}).".format(self.filesize, utils.sizeof_human(self.filesize))) except (IndexError, KeyError, TypeError): self.logger.warning("Server did not send Content-Length. Filesize is unknown.") self.filesize = 0 args = utils.calc_chunk_size(self.filesize, self.threads_count, self.minChunkFile) bytes_per_thread = args[0][1] - args[0][0] + 1 if len(args)>1: self.logger.info("Launching {} threads (downloads {}/thread).".format(len(args), utils.sizeof_human(bytes_per_thread))) else: self.logger.info("Launching 1 thread (downloads {}).".format(utils.sizeof_human(bytes_per_thread))) self.status = "downloading" for i, arg in enumerate(args): req = self.pool.submit( download, self.url, self.dest+".%.3d" % i, self.requestArgs, self.context, arg[0], arg[1], self.timeout, self.shared_var, self.thread_shared_cmds, self.logger ) self.post_threadpool_thread = threading.Thread( target=post_threadpool_actions, args=( self.pool, [[(self.dest+".%.3d" % i) for i in range(len(args))], self.dest], self.filesize, self ) ) self.post_threadpool_thread.daemon = True self.post_threadpool_thread.start() self.control_thread = ControlThread(self) if blocking: self.wait(raise_exceptions=True) def _exc_callback(self, req, e): self.errors.append(e[0]) self.logger.exception(e[1]) def retry(self, eStr=""): if self.current_attemp < self.attemps_limit: self.current_attemp += 1 self.status = "ready" self.shared_var.value = 0 self.thread_shared_cmds = {} self.start() else: s = 'The maximum retry attempts reached' if eStr: s += " ({})".format(eStr) self.errors.append(urllib.error.HTTPError(self.url, "0", s, {}, StringIO())) self._failed = True def try_next_mirror(self, e=None): if self.mirrors: if e: self.errors.append(e) self.status = "ready" self.shared_var.value = 0 self.url = self.mirrors.pop(0) self.logger.info('Using url "{}"'.format(self.url)) self.start() else: self._failed = True self.errors.append(e) def get_eta(self, human=False): ''' Get estimated time of download completion, in seconds. Returns `0` if there is no enough data to calculate the estimated time (this will happen on the approx. first 5 seconds of each download). :param human: If true, returns a human-readable formatted string. Else, returns an int type number :type human: bool :rtype: int/string ''' if human: s = utils.time_human(self.control_thread.get_eta()) return s if s else "TBD" return self.control_thread.get_eta() def get_speed(self, human=False): ''' Get current transfer speed in bytes per second. :param human: If true, returns a human-readable formatted string. Else, returns an int type number :type human: bool :rtype: int/string ''' if human: return "{}/s".format(utils.sizeof_human(self.control_thread.get_speed())) return self.control_thread.get_speed() def get_progress(self): ''' Returns the current progress of the download, as a float between `0` and `1`. :rtype: float ''' if not self.filesize: return 0 if self.control_thread.get_dl_size() <= self.filesize: return 1.0*self.control_thread.get_dl_size()/self.filesize return 1.0 def get_progress_bar(self, length=20): ''' Returns the current progress of the download as a string containing a progress bar. .. NOTE:: That's an alias for pySmartDL.utils.progress_bar(obj.get_progress()). :param length: The length of the progress bar in chars. Default is 20. :type length: int :rtype: string ''' return utils.progress_bar(self.get_progress(), length) def isFinished(self): ''' Returns if the task is finished. :rtype: bool ''' if self.status == "ready": return False if self.status == "finished": return True return not self.post_threadpool_thread.is_alive() def isSuccessful(self): ''' Returns if the download is successfull. It may fail in the following scenarios: - Hash check is enabled and fails. - All mirrors are down. - Any local I/O problems (such as `no disk space available`). .. NOTE:: Call `get_errors()` to get the exceptions, if any. Will raise `RuntimeError` if it's called when the download task is not finished yet. :rtype: bool ''' if self._killed: return False n = 0 while self.status != 'finished': n += 1 time.sleep(0.1) if n >= 15: raise RuntimeError("The download task must be finished in order to see if it's successful. (current status is {})".format(self.status)) return not self._failed def get_errors(self): ''' Get errors happened while downloading. :rtype: list of `Exception` instances ''' return self.errors def get_status(self): ''' Returns the current status of the task. Possible values: *ready*, *downloading*, *paused*, *combining*, *finished*. :rtype: string ''' return self.status def wait(self, raise_exceptions=False): ''' Blocks until the download is finished. :param raise_exceptions: If true, this function will raise exceptions. Default is *False*. :type raise_exceptions: bool ''' if self.status in ["ready", "finished"]: return while not self.isFinished(): time.sleep(0.1) self.post_threadpool_thread.join() self.control_thread.join() if self._failed and raise_exceptions: raise self.errors[-1] def stop(self): ''' Stops the download. ''' if self.status == "downloading": self.thread_shared_cmds['stop'] = "" self._killed = True def pause(self): ''' Pauses the download. ''' if self.status == "downloading": self.status = "paused" self.thread_shared_cmds['pause'] = "" def resume(self): ''' Continues the download. same as unpause(). ''' self.unpause() def unpause(self): ''' Continues the download. same as resume(). ''' if self.status == "paused" and 'pause' in self.thread_shared_cmds: self.status = "downloading" del self.thread_shared_cmds['pause'] def limit_speed(self, speed): ''' Limits the download transfer speed. :param speed: Speed in bytes per download per second. Negative values will not limit the speed. Default is `-1`. :type speed: int ''' if self.status == "downloading": if speed == 0: self.pause() else: self.unpause() if speed > 0: self.thread_shared_cmds['limit'] = speed/self.threads_count elif 'limit' in self.thread_shared_cmds: del self.thread_shared_cmds['limit'] def get_dest(self): ''' Get the destination path of the downloaded file. Needed when no destination is provided to the class, and exists on a temp folder. :rtype: string ''' return self.dest def get_dl_time(self, human=False): ''' Returns how much time did the download take, in seconds. Returns `-1` if the download task is not finished yet. :param human: If true, returns a human-readable formatted string. Else, returns an int type number :type human: bool :rtype: int/string ''' if not self.control_thread: return 0 if human: return utils.time_human(self.control_thread.get_dl_time()) return self.control_thread.get_dl_time() def get_dl_size(self, human=False): ''' Get downloaded bytes counter in bytes. :param human: If true, returns a human-readable formatted string. Else, returns an int type number :type human: bool :rtype: int/string ''' if not self.control_thread: return 0 if human: return utils.sizeof_human(self.control_thread.get_dl_size()) return self.control_thread.get_dl_size() def get_final_filesize(self, human=False): ''' Get total download size in bytes. :param human: If true, returns a human-readable formatted string. Else, returns an int type number :type human: bool :rtype: int/string ''' if not self.control_thread: return 0 if human: return utils.sizeof_human(self.control_thread.get_final_filesize()) return self.control_thread.get_final_filesize() def get_data(self, binary=False, bytes=-1): ''' Returns the downloaded data. Will raise `RuntimeError` if it's called when the download task is not finished yet. :param binary: If true, will read the data as binary. Else, will read it as text. :type binary: bool :param bytes: Number of bytes to read. Negative values will read until EOF. Default is `-1`. :type bytes: int :rtype: string ''' if self.status != 'finished': raise RuntimeError("The download task must be finished in order to read the data. (current status is %s)" % self.status) flags = 'rb' if binary else 'r' with open(self.get_dest(), flags) as f: data = f.read(bytes) if bytes>0 else f.read() return data def get_data_hash(self, algorithm): ''' Returns the downloaded data's hash. Will raise `RuntimeError` if it's called when the download task is not finished yet. :param algorithm: Hashing algorithm. :type algorithm: bool :rtype: string .. WARNING:: The hashing algorithm must be supported on your system, as documented at `hashlib documentation page `_. ''' return hashlib.new(algorithm, self.get_data(binary=True)).hexdigest() def get_json(self): ''' Returns the JSON in the downloaded data. Will raise `RuntimeError` if it's called when the download task is not finished yet. Will raise `json.decoder.JSONDecodeError` if the downloaded data is not valid JSON. :rtype: dict ''' data = self.get_data() return json.loads(data) def post_threadpool_actions(pool, args, expected_filesize, SmartDLObj): "Run function after thread pool is done. Run this in a thread." while not pool.done(): time.sleep(0.1) if SmartDLObj._killed: return if pool.get_exception(): for exc in pool.get_exceptions(): SmartDLObj.logger.exception(exc) SmartDLObj.retry(str(pool.get_exception())) if SmartDLObj._failed: SmartDLObj.logger.warning("Task had errors. Exiting...") return if expected_filesize: # if not zero, expected filesize is known threads = len(args[0]) total_filesize = sum([os.path.getsize(x) for x in args[0]]) diff = math.fabs(expected_filesize - total_filesize) # if the difference is more than 4*thread numbers (because a thread may download 4KB extra per thread because of NTFS's block size) if diff > 4*1024*threads: errMsg = 'Diff between downloaded files and expected filesizes is {}B (filesize: {}, expected_filesize: {}, {} threads).'.format(total_filesize, expected_filesize, diff, threads) SmartDLObj.logger.warning(errMsg) SmartDLObj.retry(errMsg) return SmartDLObj.status = "combining" utils.combine_files(*args) if SmartDLObj.verify_hash: dest_path = args[-1] hash_ = utils.get_file_hash(SmartDLObj.hash_algorithm, dest_path) if hash_ == SmartDLObj.hash_code: SmartDLObj.logger.info('Hash verification succeeded.') else: SmartDLObj.logger.warning('Hash verification failed.') SmartDLObj.try_next_mirror(HashFailedException(os.path.basename(dest_path), hash, SmartDLObj.hash_code))