Patroneo/template/pySmartDL/pySmartDL.py

import os
import sys
import urllib.request, urllib.error, urllib.parse
import copy
import threading
import time
import math
import tempfile
import base64
import hashlib
import socket
import logging
from io import StringIO
import multiprocessing.dummy as multiprocessing
from ctypes import c_int
import json
import ssl

from . import utils
from .control_thread import ControlThread
from .download import download

__all__ = ['SmartDL', 'utils']
__version_mjaor__ = 1
__version_minor__ = 3
__version_micro__ = 4
__version__ = "{}.{}.{}".format(__version_mjaor__, __version_minor__, __version_micro__)

class HashFailedException(Exception):
    "Raised when hash check fails."
    def __init__(self, fn, calc_hash, needed_hash):
        self.filename = fn
        self.calculated_hash = calc_hash
        self.needed_hash = needed_hash
    def __str__(self):
        return 'HashFailedException({}, got {}, expected {})'.format(self.filename, self.calculated_hash, self.needed_hash)
    def __repr__(self):
        return '<HashFailedException {}, got {}, expected {}>'.format(self.filename, self.calculated_hash, self.needed_hash)
        
class CanceledException(Exception):
    "Raised when the job is canceled."
    def __init__(self):
        pass
    def __str__(self):
        return 'CanceledException'
    def __repr__(self):
        return "<CanceledException>"

class SmartDL:
    '''
    The main SmartDL class
    
    :param urls: Download url. It is possible to pass unsafe and unicode characters. You can also pass a list of urls, and those will be used as mirrors.
    :type urls: string or list of strings
    :param dest: Destination path. Default is `%TEMP%/pySmartDL/`.
    :type dest: string
    :param progress_bar: If True, prints a progress bar to the `stdout stream <http://docs.python.org/2/library/sys.html#sys.stdout>`_. Default is `True`.
    :type progress_bar: bool
	:param fix_urls: If true, attempts to fix urls with unsafe characters.
	:type fix_urls: bool
	:param threads: Number of threads to use.
	:type threads: int
    :param timeout: Timeout for network operations, in seconds. Default is 5.
	:type timeout: int
    :param logger: An optional logger.
    :type logger: `logging.Logger` instance
    :param connect_default_logger: If true, connects a default logger to the class.
    :type connect_default_logger: bool
    :param request_args: Arguments to be passed to a new urllib.request.Request instance in dictionary form. See `urllib.request docs <https://docs.python.org/3/library/urllib.request.html#urllib.request.Request>`_ for options. 
    :type request_args: dict
    :rtype: `SmartDL` instance
    :param verify: If ssl certificates should be validated.
    :type verify: bool
    
    .. NOTE::
            The provided dest may be a folder or a full path name (including filename). The workflow is:
            
            * If the path exists, and it's an existing folder, the file will be downloaded to there with the original filename.
            * If the past does not exist, it will create the folders, if needed, and refer to the last section of the path as the filename.
            * If you want to download to folder that does not exist at the moment, and want the module to fill in the filename, make sure the path ends with `os.sep`.
            * If no path is provided, `%TEMP%/pySmartDL/` will be used.
    '''
    
    def __init__(self, urls, dest=None, progress_bar=True, fix_urls=True, threads=5, timeout=5, logger=None, connect_default_logger=False, request_args=None, verify=True):
        if logger:
            self.logger = logger
        elif connect_default_logger:
            self.logger = utils.create_debugging_logger()
        else:
            self.logger = utils.DummyLogger()
        if request_args:
            if "headers" not in request_args:
                request_args["headers"] = dict()
            self.requestArgs = request_args
        else:
            self.requestArgs = {"headers": dict()}
        if "User-Agent" not in self.requestArgs["headers"]:
            self.requestArgs["headers"]["User-Agent"] = utils.get_random_useragent()
        self.mirrors = [urls] if isinstance(urls, str) else urls
        if fix_urls:
            self.mirrors = [utils.url_fix(x) for x in self.mirrors]
        self.url = self.mirrors.pop(0)
        self.logger.info('Using url "{}"'.format(self.url))

        fn = urllib.parse.unquote(os.path.basename(urllib.parse.urlparse(self.url).path))
        self.dest = dest or os.path.join(tempfile.gettempdir(), 'pySmartDL', fn)
        if self.dest[-1] == os.sep:
            if os.path.exists(self.dest[:-1]) and os.path.isfile(self.dest[:-1]):
                os.unlink(self.dest[:-1])
            self.dest += fn
        if os.path.isdir(self.dest):
            self.dest = os.path.join(self.dest, fn)
        
        self.progress_bar = progress_bar
        self.threads_count = threads
        self.timeout = timeout
        self.current_attemp = 1 
        self.attemps_limit = 4
        self.minChunkFile = 1024**2*2 # 2MB
        self.filesize = 0
        self.shared_var = multiprocessing.Value(c_int, 0)  # a ctypes var that counts the bytes already downloaded
        self.thread_shared_cmds = {}
        self.status = "ready"
        self.verify_hash = False
        self._killed = False
        self._failed = False
        self._start_func_blocking = True
        self.errors = []
        
        self.post_threadpool_thread = None
        self.control_thread = None
        
        if not os.path.exists(os.path.dirname(self.dest)):
            self.logger.info('Folder "{}" does not exist. Creating...'.format(os.path.dirname(self.dest)))
            os.makedirs(os.path.dirname(self.dest))
        if not utils.is_HTTPRange_supported(self.url, timeout=self.timeout):
            self.logger.warning("Server does not support HTTPRange. threads_count is set to 1.")
            self.threads_count = 1
        if os.path.exists(self.dest):
            self.logger.warning('Destination "{}" already exists. Existing file will be removed.'.format(self.dest))
        if not os.path.exists(os.path.dirname(self.dest)):
            self.logger.warning('Directory "{}" does not exist. Creating it...'.format(os.path.dirname(self.dest)))
            os.makedirs(os.path.dirname(self.dest))
        
        self.logger.info("Creating a ThreadPool of {} thread(s).".format(self.threads_count))
        self.pool = utils.ManagedThreadPoolExecutor(self.threads_count)

        if verify:
            self.context = None
        else:
            self.context = ssl.create_default_context()
            self.context.check_hostname = False
            self.context.verify_mode = ssl.CERT_NONE
        
    def __str__(self):
        return 'SmartDL(r"{}", dest=r"{}")'.format(self.url, self.dest)

    def __repr__(self):
        return "<SmartDL {}>".format(self.url)
        
    def add_basic_authentication(self, username, password):
        '''
        Uses HTTP Basic Access authentication for the connection.
        
        :param username: Username.
        :type username: string
        :param password: Password.
        :type password: string
        '''
        auth_string = '{}:{}'.format(username, password)
        base64string = base64.standard_b64encode(auth_string.encode('utf-8'))
        self.requestArgs['headers']['Authorization'] = b"Basic " + base64string
        
    def add_hash_verification(self, algorithm, hash):
        '''
        Adds hash verification to the download.
        
        If hash is not correct, will try different mirrors. If all mirrors aren't
        passing hash verification, `HashFailedException` Exception will be raised.
        
        .. NOTE::
            If downloaded file already exist on the destination, and hash matches, pySmartDL will not download it again.
            
        .. WARNING::
            The hashing algorithm must be supported on your system, as documented at `hashlib documentation page <http://docs.python.org/3/library/hashlib.html>`_.
        
        :param algorithm: Hashing algorithm.
        :type algorithm: string
        :param hash: Hash code.
        :type hash: string
        '''
        
        self.verify_hash = True
        self.hash_algorithm = algorithm
        self.hash_code = hash
        
    def fetch_hash_sums(self):
        '''
        Will attempt to fetch UNIX hash sums files (`SHA256SUMS`, `SHA1SUMS` or `MD5SUMS` files in
        the same url directory).
        
        Calls `self.add_hash_verification` if successful. Returns if a matching hash was found.
        
        :rtype: bool
        
        *New in 1.2.1*
        '''
        default_sums_filenames = ['SHA256SUMS', 'SHA1SUMS', 'MD5SUMS']
        folder = os.path.dirname(self.url)
        orig_basename = os.path.basename(self.url)
        
        self.logger.info("Looking for SUMS files...")
        for filename in default_sums_filenames:
            try:
                sums_url = "%s/%s" % (folder, filename)
                sumsRequest = urllib.request.Request(sums_url, **self.requestArgs)
                obj = urllib.request.urlopen(sumsRequest)
                data = obj.read().split('\n')
                obj.close()
                
                for line in data:
                    if orig_basename.lower() in line.lower():
                        self.logger.info("Found a matching hash in %s" % sums_url)
                        algo = filename.rstrip('SUMS')
                        hash = line.split(' ')[0]
                        self.add_hash_verification(algo, hash)
                        return
                
            except urllib.error.HTTPError:
                continue
        
    def start(self, blocking=None):
        '''
        Starts the download task. Will raise `RuntimeError` if it's the object's already downloading.
        
        .. warning::
            If you're using the non-blocking mode, Exceptions won't be raised. In that case, call
            `isSuccessful()` after the task is finished, to make sure the download succeeded. Call
            `get_errors()` to get the the exceptions.
        
        :param blocking: If true, calling this function will block the thread until the download finished. Default is *True*.
        :type blocking: bool
        '''
        if not self.status == "ready":
            raise RuntimeError("cannot start (current status is {})".format(self.status))
        self.logger.info('Starting a new SmartDL operation.')

        if blocking is None:
            blocking = self._start_func_blocking
        else:
            self._start_func_blocking = blocking
        
        if self.mirrors:
            self.logger.info('One URL and {} mirrors are loaded.'.format(len(self.mirrors)))
        else:
            self.logger.info('One URL is loaded.')
        
        if self.verify_hash and os.path.exists(self.dest):
            if utils.get_file_hash(self.hash_algorithm, self.dest) == self.hash_code:
                self.logger.info("Destination '%s' already exists, and the hash matches. No need to download." % self.dest)
                self.status = 'finished'
                return

        self.logger.info("Downloading '{}' to '{}'...".format(self.url, self.dest))
        req = urllib.request.Request(self.url, **self.requestArgs)
        try:
            urlObj = urllib.request.urlopen(req, timeout=self.timeout, context=self.context)
        except (urllib.error.HTTPError, urllib.error.URLError, socket.timeout) as e:
            self.errors.append(e)
            if self.mirrors:
                self.logger.info("{} Trying next mirror...".format(str(e)))
                self.url = self.mirrors.pop(0)
                self.logger.info('Using url "{}"'.format(self.url))
                self.start(blocking)
                return
            else:
                self.logger.warning(str(e))
                self.errors.append(e)
                self._failed = True
                self.status = "finished"
                raise
        
        try:
            self.filesize = int(urlObj.headers["Content-Length"])
            self.logger.info("Content-Length is {} ({}).".format(self.filesize, utils.sizeof_human(self.filesize)))
        except (IndexError, KeyError, TypeError):
            self.logger.warning("Server did not send Content-Length. Filesize is unknown.")
            self.filesize = 0
            
        args = utils.calc_chunk_size(self.filesize, self.threads_count, self.minChunkFile)
        bytes_per_thread = args[0][1] - args[0][0] + 1
        if len(args)>1:
            self.logger.info("Launching {} threads (downloads {}/thread).".format(len(args),  utils.sizeof_human(bytes_per_thread)))
        else:
            self.logger.info("Launching 1 thread (downloads {}).".format(utils.sizeof_human(bytes_per_thread)))
        
        self.status = "downloading"
        
        for i, arg in enumerate(args):
            req = self.pool.submit(
                download,
                self.url,
                self.dest+".%.3d" % i,
                self.requestArgs,
                self.context,
                arg[0],
                arg[1],
                self.timeout,
                self.shared_var,
                self.thread_shared_cmds,
                self.logger
            )
        
        self.post_threadpool_thread = threading.Thread(
            target=post_threadpool_actions,
            args=(
                self.pool,
                [[(self.dest+".%.3d" % i) for i in range(len(args))], self.dest],
                self.filesize,
                self
            )
        )
        self.post_threadpool_thread.daemon = True
        self.post_threadpool_thread.start()
        
        self.control_thread = ControlThread(self)
        
        if blocking:
            self.wait(raise_exceptions=True)
            
    def _exc_callback(self, req, e):
        self.errors.append(e[0])
        self.logger.exception(e[1])
        
    def retry(self, eStr=""):
        if self.current_attemp < self.attemps_limit:
            self.current_attemp += 1
            self.status = "ready"
            self.shared_var.value = 0
            self.thread_shared_cmds = {}
            self.start()
             
        else:
            s = 'The maximum retry attempts reached'
            if eStr:
                s += " ({})".format(eStr)
            self.errors.append(urllib.error.HTTPError(self.url, "0", s, {}, StringIO()))
            self._failed = True
            
    def try_next_mirror(self, e=None):
        if self.mirrors:
            if e:
                self.errors.append(e)
            self.status = "ready"
            self.shared_var.value = 0
            self.url = self.mirrors.pop(0)
            self.logger.info('Using url "{}"'.format(self.url))
            self.start()
        else:
            self._failed = True
            self.errors.append(e)
    
    def get_eta(self, human=False):
        '''
        Get estimated time of download completion, in seconds. Returns `0` if there is
        no enough data to calculate the estimated time (this will happen on the approx.
        first 5 seconds of each download).
        
        :param human: If true, returns a human-readable formatted string. Else, returns an int type number
        :type human: bool
        :rtype: int/string
        '''
        if human:
            s = utils.time_human(self.control_thread.get_eta())
            return s if s else "TBD"
        return self.control_thread.get_eta()

    def get_speed(self, human=False):
        '''
        Get current transfer speed in bytes per second.
        
        :param human: If true, returns a human-readable formatted string. Else, returns an int type number
        :type human: bool
        :rtype: int/string
        '''
        if human:
            return "{}/s".format(utils.sizeof_human(self.control_thread.get_speed()))
        return self.control_thread.get_speed()

    def get_progress(self):
        '''
        Returns the current progress of the download, as a float between `0` and `1`.
        
        :rtype: float
        '''
        if not self.filesize:
            return 0
        if self.control_thread.get_dl_size() <= self.filesize:
            return 1.0*self.control_thread.get_dl_size()/self.filesize
        return 1.0

    def get_progress_bar(self, length=20):
        '''
        Returns the current progress of the download as a string containing a progress bar.
        
        .. NOTE::
            That's an alias for pySmartDL.utils.progress_bar(obj.get_progress()).
        
        :param length: The length of the progress bar in chars. Default is 20.
        :type length: int
        :rtype: string
        '''
        return utils.progress_bar(self.get_progress(), length)

    def isFinished(self):
        '''
        Returns if the task is finished.
        
        :rtype: bool
        '''
        if self.status == "ready":
            return False
        if self.status == "finished":
            return True
        return not self.post_threadpool_thread.is_alive()

    def isSuccessful(self):
        '''
        Returns if the download is successfull. It may fail in the following scenarios:
        
        - Hash check is enabled and fails.
        - All mirrors are down.
        - Any local I/O problems (such as `no disk space available`).
        
        .. NOTE::
            Call `get_errors()` to get the exceptions, if any.
        
        Will raise `RuntimeError` if it's called when the download task is not finished yet.
        
        :rtype: bool
        '''
        
        if self._killed:
            return False
        
        n = 0
        while self.status != 'finished':
            n += 1
            time.sleep(0.1)
            if n >= 15:
                raise RuntimeError("The download task must be finished in order to see if it's successful. (current status is {})".format(self.status))
            
        return not self._failed
        
    def get_errors(self):
        '''
        Get errors happened while downloading.
        
        :rtype: list of `Exception` instances
        '''
        return self.errors
        
    def get_status(self):
        '''
        Returns the current status of the task. Possible values: *ready*,
        *downloading*, *paused*, *combining*, *finished*.
        
        :rtype: string
        '''
        return self.status

    def wait(self, raise_exceptions=False):
        '''
        Blocks until the download is finished.
        
        :param raise_exceptions: If true, this function will raise exceptions. Default is *False*.
        :type raise_exceptions: bool
        '''
        if self.status in ["ready", "finished"]:
            return
            
        while not self.isFinished():
            time.sleep(0.1)
        self.post_threadpool_thread.join()
        self.control_thread.join()
        
        if self._failed and raise_exceptions:
            raise self.errors[-1]

    def stop(self):
        '''
        Stops the download.
        '''
        if self.status == "downloading":
            self.thread_shared_cmds['stop'] = ""
            self._killed = True

    def pause(self):
        '''
        Pauses the download.
        '''
        if self.status == "downloading":
            self.status = "paused"
            self.thread_shared_cmds['pause'] = ""

    def resume(self):
        '''
        Continues the download. same as unpause().
        '''
        self.unpause()

    def unpause(self):
        '''
        Continues the download. same as resume().
        '''
        if self.status == "paused" and 'pause' in self.thread_shared_cmds:
            self.status = "downloading"
            del self.thread_shared_cmds['pause']
    
    def limit_speed(self, speed):
        '''
        Limits the download transfer speed.
        
        :param speed: Speed in bytes per download per second. Negative values will not limit the speed. Default is `-1`.
        :type speed: int
        '''
        if self.status == "downloading":
            if speed == 0:
                self.pause()
            else:
                self.unpause()

        if speed > 0:
            self.thread_shared_cmds['limit'] = speed/self.threads_count
        elif 'limit' in self.thread_shared_cmds:
            del self.thread_shared_cmds['limit']
        
    def get_dest(self):
        '''
        Get the destination path of the downloaded file. Needed when no
        destination is provided to the class, and exists on a temp folder.
        
        :rtype: string
        '''
        return self.dest
    def get_dl_time(self, human=False):
        '''
        Returns how much time did the download take, in seconds. Returns
        `-1` if the download task is not finished yet.

        :param human: If true, returns a human-readable formatted string. Else, returns an int type number
        :type human: bool
        :rtype: int/string
        '''
        if not self.control_thread:
            return 0
        if human:
            return utils.time_human(self.control_thread.get_dl_time())
        return self.control_thread.get_dl_time()
        
    def get_dl_size(self, human=False):
        '''
        Get downloaded bytes counter in bytes.
        
        :param human: If true, returns a human-readable formatted string. Else, returns an int type number
        :type human: bool
        :rtype: int/string
        '''
        if not self.control_thread:
            return 0
        if human:
            return utils.sizeof_human(self.control_thread.get_dl_size())    
        return self.control_thread.get_dl_size()

    def get_final_filesize(self, human=False):
        '''
        Get total download size in bytes.
        
        :param human: If true, returns a human-readable formatted string. Else, returns an int type number
        :type human: bool
        :rtype: int/string
        '''
        if not self.control_thread:
            return 0
        if human:
            return utils.sizeof_human(self.control_thread.get_final_filesize())    
        return self.control_thread.get_final_filesize()
    
    
    def get_data(self, binary=False, bytes=-1):
        '''
        Returns the downloaded data. Will raise `RuntimeError` if it's
        called when the download task is not finished yet.
        
        :param binary: If true, will read the data as binary. Else, will read it as text.
        :type binary: bool
        :param bytes: Number of bytes to read. Negative values will read until EOF. Default is `-1`.
        :type bytes: int
        :rtype: string
        '''
        if self.status != 'finished':
            raise RuntimeError("The download task must be finished in order to read the data. (current status is %s)" % self.status)
            
        flags = 'rb' if binary else 'r'
        with open(self.get_dest(), flags) as f:
            data = f.read(bytes) if bytes>0 else f.read()
        return data
        
    def get_data_hash(self, algorithm):
        '''
        Returns the downloaded data's hash. Will raise `RuntimeError` if it's
        called when the download task is not finished yet.
        
        :param algorithm: Hashing algorithm.
        :type algorithm: bool
        :rtype: string
        
        .. WARNING::
            The hashing algorithm must be supported on your system, as documented at `hashlib documentation page <http://docs.python.org/3/library/hashlib.html>`_.
        '''
        return hashlib.new(algorithm, self.get_data(binary=True)).hexdigest()

    def get_json(self):
        '''
        Returns the JSON in the downloaded data. Will raise `RuntimeError` if it's
        called when the download task is not finished yet. Will raise `json.decoder.JSONDecodeError`
        if the downloaded data is not valid JSON.
        
        :rtype: dict
        '''
        data = self.get_data()
        return json.loads(data)

def post_threadpool_actions(pool, args, expected_filesize, SmartDLObj):
    "Run function after thread pool is done. Run this in a thread."
    while not pool.done():
        time.sleep(0.1)

    if SmartDLObj._killed:
        return
        
    if pool.get_exception():
        for exc in pool.get_exceptions():
            SmartDLObj.logger.exception(exc)
            
        SmartDLObj.retry(str(pool.get_exception()))
       
    if SmartDLObj._failed:
        SmartDLObj.logger.warning("Task had errors. Exiting...")
        return
        
    if expected_filesize:  # if not zero, expected filesize is known
        threads = len(args[0])
        total_filesize = sum([os.path.getsize(x) for x in args[0]])
        diff = math.fabs(expected_filesize - total_filesize)
        
        # if the difference is more than 4*thread numbers (because a thread may download 4KB extra per thread because of NTFS's block size)
        if diff > 4*1024*threads:
            errMsg = 'Diff between downloaded files and expected filesizes is {}B (filesize: {}, expected_filesize: {}, {} threads).'.format(total_filesize, expected_filesize, diff, threads)
            SmartDLObj.logger.warning(errMsg)
            SmartDLObj.retry(errMsg)
            return
    
    SmartDLObj.status = "combining"
    utils.combine_files(*args)
    
    if SmartDLObj.verify_hash:
        dest_path = args[-1]            
        hash_ = utils.get_file_hash(SmartDLObj.hash_algorithm, dest_path)
	
        if hash_ == SmartDLObj.hash_code:
            SmartDLObj.logger.info('Hash verification succeeded.')
        else:
            SmartDLObj.logger.warning('Hash verification failed.')
            SmartDLObj.try_next_mirror(HashFailedException(os.path.basename(dest_path), hash, SmartDLObj.hash_code))