Nils
3 years ago
15 changed files with 1437 additions and 30 deletions
@ -0,0 +1,24 @@ |
|||||
|
This is free and unencumbered software released into the public domain. |
||||
|
|
||||
|
Anyone is free to copy, modify, publish, use, compile, sell, or |
||||
|
distribute this software, either in source code form or as a compiled |
||||
|
binary, for any purpose, commercial or non-commercial, and by any |
||||
|
means. |
||||
|
|
||||
|
In jurisdictions that recognize copyright laws, the author or authors |
||||
|
of this software dedicate any and all copyright interest in the |
||||
|
software to the public domain. We make this dedication for the benefit |
||||
|
of the public at large and to the detriment of our heirs and |
||||
|
successors. We intend this dedication to be an overt act of |
||||
|
relinquishment in perpetuity of all present and future rights to this |
||||
|
software under copyright law. |
||||
|
|
||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
||||
|
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
||||
|
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR |
||||
|
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
||||
|
OTHER DEALINGS IN THE SOFTWARE. |
||||
|
|
||||
|
For more information, please refer to <http://unlicense.org/> |
@ -0,0 +1,59 @@ |
|||||
|
Python Smart Download Manager -- pySmartDL |
||||
|
========================================== |
||||
|
|
||||
|
``pySmartDL`` strives to be a full-fledged smart download manager for Python. Main features: |
||||
|
|
||||
|
* Built-in download acceleration (with the `multipart downloading technique <http://stackoverflow.com/questions/93642/how-do-download-accelerators-work>`_). |
||||
|
* Mirrors support. |
||||
|
* Pause/Unpause feature. |
||||
|
* Speed limiting feature. |
||||
|
* Hash checking. |
||||
|
* Non-blocking, shows progress bar, download speed and eta. |
||||
|
* Full support for custom headers and methods. |
||||
|
* Python 3 Support |
||||
|
|
||||
|
Project Links |
||||
|
============= |
||||
|
|
||||
|
* Downloads: http://pypi.python.org/pypi/pySmartDL/ |
||||
|
* Documentation: http://itaybb.github.io/pySmartDL/ |
||||
|
* Project page: https://github.com/iTaybb/pySmartDL/ |
||||
|
* Bugs and Issues: https://github.com/iTaybb/pySmartDL/issues |
||||
|
|
||||
|
Installation |
||||
|
============ |
||||
|
|
||||
|
**Using pip (recommended way)** |
||||
|
|
||||
|
Make sure python-pip is installed on your system. If you are using virtualenv, then pip is already installed into environments created by virtualenv. Run pip to install pySmartDL: |
||||
|
|
||||
|
``pip install pySmartDL`` |
||||
|
|
||||
|
**From Source** |
||||
|
|
||||
|
The pySmartDL package is installed from source using distutils in the usual way. Download the `source distribution <http://pypi.python.org/pypi/pySmartDL>`_ first. Unpack the source zip and run the following to install the package site-wide: |
||||
|
|
||||
|
``python setup.py install`` |
||||
|
|
||||
|
Usage |
||||
|
===== |
||||
|
|
||||
|
Download is as simple as creating an instance and starting it: |
||||
|
|
||||
|
from pySmartDL import SmartDL |
||||
|
|
||||
|
url = "https://github.com/iTaybb/pySmartDL/raw/master/test/7za920.zip" |
||||
|
dest = "C:\\Downloads\\" # or '~/Downloads/' on linux |
||||
|
|
||||
|
obj = SmartDL(url, dest) |
||||
|
obj.start() |
||||
|
# [*] 0.23 Mb / 0.37 Mb @ 88.00Kb/s [##########--------] [60%, 2s left] |
||||
|
|
||||
|
path = obj.get_dest() |
||||
|
|
||||
|
Requirements |
||||
|
============== |
||||
|
|
||||
|
* Python 3.4 or greater. |
||||
|
|
||||
|
Copyright (C) 2014-2020 Itay Brandes. |
@ -0,0 +1,4 @@ |
|||||
|
# Public package surface: re-export the main class and its exceptions.
from .pySmartDL import SmartDL, HashFailedException, CanceledException
# Import the version string explicitly instead of the previous
# `__version__ = pySmartDL.__version__`, which relied on the import
# machinery implicitly binding the submodule name into this namespace
# (fragile, and flagged as an undefined name by linters).
from .pySmartDL import __version__
from . import utils
@ -0,0 +1,119 @@ |
|||||
|
import threading |
||||
|
import time |
||||
|
|
||||
|
from . import utils |
||||
|
|
||||
|
class ControlThread(threading.Thread):
    """Monitor thread for a running SmartDL object.

    Samples the shared byte counter roughly 10 times per second to compute
    download speed and ETA, optionally renders a console progress bar, and
    flips the owner's status to "finished" once all work is done.
    """
    def __init__(self, obj):
        # obj is the owning SmartDL instance; this thread only observes it.
        threading.Thread.__init__(self)
        self.obj = obj
        self.progress_bar = obj.progress_bar
        self.logger = obj.logger
        self.shared_var = obj.shared_var  # shared counter of bytes downloaded so far

        self.dl_speed = 0
        self.eta = 0
        self.lastBytesSamples = []  # list with last 50 Bytes Samples.
        self.last_calculated_totalBytes = 0
        self.calcETA_queue = []
        self.calcETA_i = 0
        self.calcETA_val = 0
        self.dl_time = -1.0  # -1.0 means "not finished yet"

        # Daemonized so a stuck download cannot keep the interpreter alive.
        # NOTE: the thread starts itself on construction.
        self.daemon = True
        self.start()

    def run(self):
        t1 = time.time()
        self.logger.info("Control thread has been started.")

        # Poll until the worker thread pool reports completion.
        while not self.obj.pool.done():
            self.dl_speed = self.calcDownloadSpeed(self.shared_var.value)
            if self.dl_speed > 0:
                self.eta = self.calcETA((self.obj.filesize-self.shared_var.value)/self.dl_speed)

            if self.progress_bar:
                if self.obj.filesize:
                    status = r"[*] %s / %s @ %s/s %s [%3.1f%%, %s left] " % (utils.sizeof_human(self.shared_var.value), utils.sizeof_human(self.obj.filesize), utils.sizeof_human(self.dl_speed), utils.progress_bar(1.0*self.shared_var.value/self.obj.filesize), self.shared_var.value * 100.0 / self.obj.filesize, utils.time_human(self.eta, fmt_short=True))
                else:
                    # Total size unknown -- show only bytes downloaded so far.
                    status = r"[*] %s / ??? MB @ %s/s " % (utils.sizeof_human(self.shared_var.value), utils.sizeof_human(self.dl_speed))
                # Trailing backspaces rewind the cursor so the next iteration
                # overwrites this console line in place.
                status = status + chr(8)*(len(status)+1)
                print(status, end=' ', flush=True)
            time.sleep(0.1)

        if self.obj._killed:
            self.logger.info("File download process has been stopped.")
            return

        # Print the final 100% line (or a size-only line when total unknown).
        if self.progress_bar:
            if self.obj.filesize:
                print(r"[*] %s / %s @ %s/s %s [100%%, 0s left] " % (utils.sizeof_human(self.obj.filesize), utils.sizeof_human(self.obj.filesize), utils.sizeof_human(self.dl_speed), utils.progress_bar(1.0)))
            else:
                print(r"[*] %s / %s @ %s/s " % (utils.sizeof_human(self.shared_var.value), utils.sizeof_human(self.shared_var.value), utils.sizeof_human(self.dl_speed)))

        t2 = time.time()
        self.dl_time = float(t2-t1)

        # Wait for post-processing (combining chunk files etc.) to finish.
        while self.obj.post_threadpool_thread.is_alive():
            time.sleep(0.1)

        self.obj.pool.shutdown()
        self.obj.status = "finished"
        if not self.obj.errors:
            self.logger.info("File downloaded within %.2f seconds." % self.dl_time)

    def get_eta(self):
        # ETA is meaningless while paused or before enough samples exist.
        if self.eta <= 0 or self.obj.status == 'paused':
            return 0
        return self.eta

    def get_speed(self):
        # Report zero while paused (the last computed speed is stale).
        if self.obj.status == 'paused':
            return 0
        return self.dl_speed

    def get_dl_size(self):
        # Clamp to the total size; the shared counter may overshoot slightly.
        if self.shared_var.value > self.obj.filesize:
            return self.obj.filesize
        return self.shared_var.value

    def get_final_filesize(self):
        # Total size in bytes, 0 when unknown.
        return self.obj.filesize

    def get_progress(self):
        # 0 when the total size is unknown (a ratio cannot be computed).
        if not self.obj.filesize:
            return 0
        return 1.0*self.shared_var.value/self.obj.filesize

    def get_dl_time(self):
        # Elapsed wall-clock time of the download; -1.0 until it finishes.
        return self.dl_time

    def calcDownloadSpeed(self, totalBytes, sampleCount=30, sampleDuration=0.1):
        '''
        Function calculates the download rate.
        @param totalBytes: The total amount of bytes.
        @param sampleCount: How much samples should the function take into consideration.
        @param sampleDuration: Duration of a sample in seconds.
        '''
        l = self.lastBytesSamples
        newBytes = totalBytes - self.last_calculated_totalBytes
        self.last_calculated_totalBytes = totalBytes
        if newBytes >= 0:  # newBytes may be negative, will happen
            # if a thread has crashed and the totalBytes counter got decreased.
            if len(l) == sampleCount:  # calc download for last 3 seconds (30 * 100ms per signal emit)
                l.pop(0)

            l.append(newBytes)

        dlRate = sum(l)/len(l)/sampleDuration
        return dlRate

    def calcETA(self, eta):
        # Smooths the raw ETA over a sliding window; returns 0 for the first
        # 50 calls (~5 seconds at the 0.1s poll rate) while data accumulates.
        self.calcETA_i += 1
        l = self.calcETA_queue
        l.append(eta)

        # Refresh the published value only every 10th sample.
        if self.calcETA_i % 10 == 0:
            self.calcETA_val = sum(l)/len(l)
        if len(l) == 30:
            l.pop(0)

        if self.calcETA_i < 50:
            return 0
        return self.calcETA_val
@ -0,0 +1,88 @@ |
|||||
|
import os |
||||
|
import urllib.request, urllib.error, urllib.parse |
||||
|
import time |
||||
|
from . import utils |
||||
|
|
||||
|
def download(url, dest, requestArgs=None, context=None, startByte=0, endByte=None, timeout=4, shared_var=None, thread_shared_cmds=None, logger=None, retries=3):
    '''
    The basic download function that runs at each thread.

    :param url: Url to fetch.
    :param dest: Local file path the data is written to.
    :param requestArgs: Keyword arguments forwarded to `urllib.request.Request`.
    :param context: `ssl.SSLContext`; used to skip ssl validation if verify is False.
    :param startByte: First byte of the requested range.
    :param endByte: Last byte of the requested range (falsy downloads to EOF).
    :param timeout: Socket timeout in seconds.
    :param shared_var: Shared counter of bytes downloaded across all threads.
    :param thread_shared_cmds: Dict of commands ('stop', 'pause', 'limit') shared with the controller.
    :param logger: Optional logger; a silent dummy is used when omitted.
    :param retries: How many times to retry on HTTP 416.
    '''
    logger = logger or utils.DummyLogger()
    # Guard against a missing requestArgs (the default is None, and **None would raise TypeError).
    req = urllib.request.Request(url, **(requestArgs or {}))
    if endByte:
        req.add_header('Range', 'bytes={:.0f}-{:.0f}'.format(startByte, endByte))
    logger.info("Downloading '{}' to '{}'...".format(url, dest))
    try:
        # Context is used to skip ssl validation if verify is False.
        urlObj = urllib.request.urlopen(req, timeout=timeout, context=context)
    except urllib.error.HTTPError as e:
        if e.code == 416:
            '''
            HTTP 416 Error: Requested Range Not Satisfiable. Happens when we ask
            for a range that is not available on the server. It will happen when
            the server will try to send us a .html page that means something like
            "you opened too many connections to our server". If this happens, we
            will wait for the other threads to finish their connections and try again.
            '''
            if retries > 0:
                logger.warning("Thread didn't got the file it was expecting. Retrying ({} times left)...".format(retries-1))
                time.sleep(5)
                # BUG FIX: the retry used to pass its arguments positionally and
                # omitted `context`, shifting every later argument one parameter
                # to the left (startByte landed in context, etc.). Passing by
                # keyword keeps each value in its proper slot.
                return download(
                    url, dest,
                    requestArgs=requestArgs,
                    context=context,
                    startByte=startByte,
                    endByte=endByte,
                    timeout=timeout,
                    shared_var=shared_var,
                    thread_shared_cmds=thread_shared_cmds,
                    logger=logger,
                    retries=retries-1
                )
            else:
                raise
        else:
            raise

    with open(dest, 'wb') as f:
        if endByte:
            filesize = endByte-startByte
        else:
            try:
                filesize = int(urlObj.headers["Content-Length"])
                logger.info("Content-Length is {}.".format(filesize))
            except (IndexError, KeyError, TypeError):
                logger.warning("Server did not send Content-Length. Filesize is unknown.")

        filesize_dl = 0  # total downloaded size
        limitspeed_timestamp = time.time()
        limitspeed_filesize = 0
        block_sz = 8192
        while True:
            if thread_shared_cmds:
                if 'stop' in thread_shared_cmds:
                    # Imported lazily: a top-level import would be circular
                    # (pySmartDL.py imports this module). Previously this name
                    # was unresolved here and the raise was a NameError.
                    from .pySmartDL import CanceledException
                    logger.info('stop command received. Stopping.')
                    raise CanceledException()
                if 'pause' in thread_shared_cmds:
                    time.sleep(0.2)
                    continue
                if 'limit' in thread_shared_cmds:
                    now = time.time()
                    time_passed = now - limitspeed_timestamp
                    if time_passed > 0.1:  # we only observe the limit after 100ms
                        # if we passed the limit, sleep proportionally to get back under it
                        if (filesize_dl-limitspeed_filesize)/time_passed >= thread_shared_cmds['limit']:
                            time_to_sleep = (filesize_dl-limitspeed_filesize) / thread_shared_cmds['limit']
                            logger.debug('Thread has downloaded {} in {}. Limit is {}/s. Slowing down...'.format(utils.sizeof_human(filesize_dl-limitspeed_filesize), utils.time_human(time_passed, fmt_short=True, show_ms=True), utils.sizeof_human(thread_shared_cmds['limit'])))
                            time.sleep(time_to_sleep)
                            continue
                        else:
                            limitspeed_timestamp = now
                            limitspeed_filesize = filesize_dl

            try:
                buff = urlObj.read(block_sz)
            except Exception as e:
                logger.error(str(e))
                if shared_var:
                    # Roll back this thread's contribution so the global
                    # counter stays consistent after a crash.
                    shared_var.value -= filesize_dl
                raise

            if not buff:
                break

            filesize_dl += len(buff)
            if shared_var:
                shared_var.value += len(buff)
            f.write(buff)

    urlObj.close()
@ -0,0 +1,675 @@ |
|||||
|
import os |
||||
|
import sys |
||||
|
import urllib.request, urllib.error, urllib.parse |
||||
|
import copy |
||||
|
import threading |
||||
|
import time |
||||
|
import math |
||||
|
import tempfile |
||||
|
import base64 |
||||
|
import hashlib |
||||
|
import socket |
||||
|
import logging |
||||
|
from io import StringIO |
||||
|
import multiprocessing.dummy as multiprocessing |
||||
|
from ctypes import c_int |
||||
|
import json |
||||
|
import ssl |
||||
|
|
||||
|
from . import utils |
||||
|
from .control_thread import ControlThread |
||||
|
from .download import download |
||||
|
|
||||
|
__all__ = ['SmartDL', 'utils'] |
||||
|
__version_mjaor__ = 1 |
||||
|
__version_minor__ = 3 |
||||
|
__version_micro__ = 4 |
||||
|
__version__ = "{}.{}.{}".format(__version_mjaor__, __version_minor__, __version_micro__) |
||||
|
|
||||
|
class HashFailedException(Exception):
    "Raised when hash check fails."

    def __init__(self, fn, calc_hash, needed_hash):
        # Keep all three pieces so callers can inspect what went wrong.
        self.filename = fn
        self.calculated_hash = calc_hash
        self.needed_hash = needed_hash

    def _details(self):
        # Text shared between __str__ and __repr__.
        return '{}, got {}, expected {}'.format(self.filename, self.calculated_hash, self.needed_hash)

    def __str__(self):
        return 'HashFailedException({})'.format(self._details())

    def __repr__(self):
        return '<HashFailedException {}>'.format(self._details())
||||
|
|
||||
|
class CanceledException(Exception):
    "Raised when the job is canceled."

    def __init__(self):
        # Deliberately takes no arguments: a cancel carries no extra state.
        super().__init__()

    def __str__(self):
        return 'CanceledException'

    def __repr__(self):
        return "<CanceledException>"
||||
|
|
||||
|
class SmartDL: |
||||
|
''' |
||||
|
The main SmartDL class |
||||
|
|
||||
|
:param urls: Download url. It is possible to pass unsafe and unicode characters. You can also pass a list of urls, and those will be used as mirrors. |
||||
|
:type urls: string or list of strings |
||||
|
:param dest: Destination path. Default is `%TEMP%/pySmartDL/`. |
||||
|
:type dest: string |
||||
|
:param progress_bar: If True, prints a progress bar to the `stdout stream <http://docs.python.org/2/library/sys.html#sys.stdout>`_. Default is `True`. |
||||
|
:type progress_bar: bool |
||||
|
:param fix_urls: If true, attempts to fix urls with unsafe characters. |
||||
|
:type fix_urls: bool |
||||
|
:param threads: Number of threads to use. |
||||
|
:type threads: int |
||||
|
:param timeout: Timeout for network operations, in seconds. Default is 5. |
||||
|
:type timeout: int |
||||
|
:param logger: An optional logger. |
||||
|
:type logger: `logging.Logger` instance |
||||
|
:param connect_default_logger: If true, connects a default logger to the class. |
||||
|
:type connect_default_logger: bool |
||||
|
:param request_args: Arguments to be passed to a new urllib.request.Request instance in dictionary form. See `urllib.request docs <https://docs.python.org/3/library/urllib.request.html#urllib.request.Request>`_ for options. |
||||
|
:type request_args: dict |
||||
|
:rtype: `SmartDL` instance |
||||
|
:param verify: If ssl certificates should be validated. |
||||
|
:type verify: bool |
||||
|
|
||||
|
.. NOTE:: |
||||
|
The provided dest may be a folder or a full path name (including filename). The workflow is: |
||||
|
|
||||
|
* If the path exists, and it's an existing folder, the file will be downloaded to there with the original filename. |
||||
|
* If the path does not exist, it will create the folders, if needed, and refer to the last section of the path as the filename. |
||||
|
* If you want to download to folder that does not exist at the moment, and want the module to fill in the filename, make sure the path ends with `os.sep`. |
||||
|
* If no path is provided, `%TEMP%/pySmartDL/` will be used. |
||||
|
''' |
||||
|
|
||||
|
    def __init__(self, urls, dest=None, progress_bar=True, fix_urls=True, threads=5, timeout=5, logger=None, connect_default_logger=False, request_args=None, verify=True):
        """Set up state, resolve the destination path and build the thread pool.

        See the class docstring for the meaning of each parameter. Touches
        the filesystem (may create directories) and the network (the HTTP
        Range-support probe).
        """
        # Pick a logger: explicit > default debugging logger > silent dummy.
        if logger:
            self.logger = logger
        elif connect_default_logger:
            self.logger = utils.create_debugging_logger()
        else:
            self.logger = utils.DummyLogger()
        # Normalize request arguments so a "headers" dict is always present.
        # NOTE(review): the caller's dict is kept (and mutated) in place.
        if request_args:
            if "headers" not in request_args:
                request_args["headers"] = dict()
            self.requestArgs = request_args
        else:
            self.requestArgs = {"headers": dict()}
        if "User-Agent" not in self.requestArgs["headers"]:
            self.requestArgs["headers"]["User-Agent"] = utils.get_random_useragent()
        # A single url becomes a one-element mirror list; the first entry
        # becomes the active url, the rest are fallbacks.
        self.mirrors = [urls] if isinstance(urls, str) else urls
        if fix_urls:
            self.mirrors = [utils.url_fix(x) for x in self.mirrors]
        self.url = self.mirrors.pop(0)
        self.logger.info('Using url "{}"'.format(self.url))

        # Derive the filename from the url path (percent-decoded).
        fn = urllib.parse.unquote(os.path.basename(urllib.parse.urlparse(self.url).path))
        self.dest = dest or os.path.join(tempfile.gettempdir(), 'pySmartDL', fn)
        if self.dest[-1] == os.sep:
            # dest ends with a separator -> treat it as a directory. If a
            # *file* with that name (sans separator) exists, remove it first.
            if os.path.exists(self.dest[:-1]) and os.path.isfile(self.dest[:-1]):
                os.unlink(self.dest[:-1])
            self.dest += fn
        if os.path.isdir(self.dest):
            self.dest = os.path.join(self.dest, fn)

        self.progress_bar = progress_bar
        self.threads_count = threads
        self.timeout = timeout
        self.current_attemp = 1   # retry bookkeeping (see retry())
        self.attemps_limit = 4
        self.minChunkFile = 1024**2*2 # 2MB
        self.filesize = 0
        self.shared_var = multiprocessing.Value(c_int, 0) # a ctypes var that counts the bytes already downloaded
        self.thread_shared_cmds = {}  # commands polled by worker threads ('stop'/'pause'/'limit')
        self.status = "ready"
        self.verify_hash = False
        self._killed = False
        self._failed = False
        self._start_func_blocking = True
        self.errors = []

        # Created lazily in start().
        self.post_threadpool_thread = None
        self.control_thread = None

        if not os.path.exists(os.path.dirname(self.dest)):
            self.logger.info('Folder "{}" does not exist. Creating...'.format(os.path.dirname(self.dest)))
            os.makedirs(os.path.dirname(self.dest))
        # Multipart downloading needs HTTP Range support; otherwise fall
        # back to a single worker thread.
        if not utils.is_HTTPRange_supported(self.url, timeout=self.timeout):
            self.logger.warning("Server does not support HTTPRange. threads_count is set to 1.")
            self.threads_count = 1
        if os.path.exists(self.dest):
            self.logger.warning('Destination "{}" already exists. Existing file will be removed.'.format(self.dest))
        if not os.path.exists(os.path.dirname(self.dest)):
            self.logger.warning('Directory "{}" does not exist. Creating it...'.format(os.path.dirname(self.dest)))
            os.makedirs(os.path.dirname(self.dest))

        self.logger.info("Creating a ThreadPool of {} thread(s).".format(self.threads_count))
        self.pool = utils.ManagedThreadPoolExecutor(self.threads_count)

        # With verify=False, build an SSL context that skips certificate
        # validation (both hostname check and chain verification).
        if verify:
            self.context = None
        else:
            self.context = ssl.create_default_context()
            self.context.check_hostname = False
            self.context.verify_mode = ssl.CERT_NONE
||||
|
|
||||
|
def __str__(self): |
||||
|
return 'SmartDL(r"{}", dest=r"{}")'.format(self.url, self.dest) |
||||
|
|
||||
|
def __repr__(self): |
||||
|
return "<SmartDL {}>".format(self.url) |
||||
|
|
||||
|
def add_basic_authentication(self, username, password): |
||||
|
''' |
||||
|
Uses HTTP Basic Access authentication for the connection. |
||||
|
|
||||
|
:param username: Username. |
||||
|
:type username: string |
||||
|
:param password: Password. |
||||
|
:type password: string |
||||
|
''' |
||||
|
auth_string = '{}:{}'.format(username, password) |
||||
|
base64string = base64.standard_b64encode(auth_string.encode('utf-8')) |
||||
|
self.requestArgs['headers']['Authorization'] = b"Basic " + base64string |
||||
|
|
||||
|
def add_hash_verification(self, algorithm, hash): |
||||
|
''' |
||||
|
Adds hash verification to the download. |
||||
|
|
||||
|
If hash is not correct, will try different mirrors. If all mirrors aren't |
||||
|
passing hash verification, `HashFailedException` Exception will be raised. |
||||
|
|
||||
|
.. NOTE:: |
||||
|
If downloaded file already exist on the destination, and hash matches, pySmartDL will not download it again. |
||||
|
|
||||
|
.. WARNING:: |
||||
|
The hashing algorithm must be supported on your system, as documented at `hashlib documentation page <http://docs.python.org/3/library/hashlib.html>`_. |
||||
|
|
||||
|
:param algorithm: Hashing algorithm. |
||||
|
:type algorithm: string |
||||
|
:param hash: Hash code. |
||||
|
:type hash: string |
||||
|
''' |
||||
|
|
||||
|
self.verify_hash = True |
||||
|
self.hash_algorithm = algorithm |
||||
|
self.hash_code = hash |
||||
|
|
||||
|
def fetch_hash_sums(self): |
||||
|
''' |
||||
|
Will attempt to fetch UNIX hash sums files (`SHA256SUMS`, `SHA1SUMS` or `MD5SUMS` files in |
||||
|
the same url directory). |
||||
|
|
||||
|
Calls `self.add_hash_verification` if successful. Returns if a matching hash was found. |
||||
|
|
||||
|
:rtype: bool |
||||
|
|
||||
|
*New in 1.2.1* |
||||
|
''' |
||||
|
default_sums_filenames = ['SHA256SUMS', 'SHA1SUMS', 'MD5SUMS'] |
||||
|
folder = os.path.dirname(self.url) |
||||
|
orig_basename = os.path.basename(self.url) |
||||
|
|
||||
|
self.logger.info("Looking for SUMS files...") |
||||
|
for filename in default_sums_filenames: |
||||
|
try: |
||||
|
sums_url = "%s/%s" % (folder, filename) |
||||
|
sumsRequest = urllib.request.Request(sums_url, **self.requestArgs) |
||||
|
obj = urllib.request.urlopen(sumsRequest) |
||||
|
data = obj.read().split('\n') |
||||
|
obj.close() |
||||
|
|
||||
|
for line in data: |
||||
|
if orig_basename.lower() in line.lower(): |
||||
|
self.logger.info("Found a matching hash in %s" % sums_url) |
||||
|
algo = filename.rstrip('SUMS') |
||||
|
hash = line.split(' ')[0] |
||||
|
self.add_hash_verification(algo, hash) |
||||
|
return |
||||
|
|
||||
|
except urllib.error.HTTPError: |
||||
|
continue |
||||
|
|
||||
|
    def start(self, blocking=None):
        '''
        Starts the download task. Will raise `RuntimeError` if the object is already downloading.

        .. warning::
            If you're using the non-blocking mode, Exceptions won't be raised. In that case, call
            `isSuccessful()` after the task is finished, to make sure the download succeeded. Call
            `get_errors()` to get the exceptions.

        :param blocking: If true, calling this function will block the thread until the download finished. Default is *True*.
        :type blocking: bool
        '''
        if not self.status == "ready":
            raise RuntimeError("cannot start (current status is {})".format(self.status))
        self.logger.info('Starting a new SmartDL operation.')

        # Remember the last explicit choice so mirror-retry recursion below
        # (and retry()) reuses the same blocking behavior.
        if blocking is None:
            blocking = self._start_func_blocking
        else:
            self._start_func_blocking = blocking

        if self.mirrors:
            self.logger.info('One URL and {} mirrors are loaded.'.format(len(self.mirrors)))
        else:
            self.logger.info('One URL is loaded.')

        # Skip the download entirely when the file is already present and
        # its hash matches the expected one.
        if self.verify_hash and os.path.exists(self.dest):
            if utils.get_file_hash(self.hash_algorithm, self.dest) == self.hash_code:
                self.logger.info("Destination '%s' already exists, and the hash matches. No need to download." % self.dest)
                self.status = 'finished'
                return

        self.logger.info("Downloading '{}' to '{}'...".format(self.url, self.dest))
        req = urllib.request.Request(self.url, **self.requestArgs)
        try:
            urlObj = urllib.request.urlopen(req, timeout=self.timeout, context=self.context)
        except (urllib.error.HTTPError, urllib.error.URLError, socket.timeout) as e:
            self.errors.append(e)
            if self.mirrors:
                # Fail over: promote the next mirror and restart recursively.
                self.logger.info("{} Trying next mirror...".format(str(e)))
                self.url = self.mirrors.pop(0)
                self.logger.info('Using url "{}"'.format(self.url))
                self.start(blocking)
                return
            else:
                self.logger.warning(str(e))
                self.errors.append(e)
                self._failed = True
                self.status = "finished"
                raise

        try:
            self.filesize = int(urlObj.headers["Content-Length"])
            self.logger.info("Content-Length is {} ({}).".format(self.filesize, utils.sizeof_human(self.filesize)))
        except (IndexError, KeyError, TypeError):
            # Unknown size: progress/ETA will be unavailable, single chunk used.
            self.logger.warning("Server did not send Content-Length. Filesize is unknown.")
            self.filesize = 0

        # Split the file into (start, end) byte ranges, one per worker.
        args = utils.calc_chunk_size(self.filesize, self.threads_count, self.minChunkFile)
        bytes_per_thread = args[0][1] - args[0][0] + 1
        if len(args)>1:
            self.logger.info("Launching {} threads (downloads {}/thread).".format(len(args), utils.sizeof_human(bytes_per_thread)))
        else:
            self.logger.info("Launching 1 thread (downloads {}).".format(utils.sizeof_human(bytes_per_thread)))

        self.status = "downloading"

        # Each chunk goes to "<dest>.NNN"; they are combined afterwards by
        # post_threadpool_actions.
        for i, arg in enumerate(args):
            req = self.pool.submit(
                download,
                self.url,
                self.dest+".%.3d" % i,
                self.requestArgs,
                self.context,
                arg[0],
                arg[1],
                self.timeout,
                self.shared_var,
                self.thread_shared_cmds,
                self.logger
            )

        self.post_threadpool_thread = threading.Thread(
            target=post_threadpool_actions,
            args=(
                self.pool,
                [[(self.dest+".%.3d" % i) for i in range(len(args))], self.dest],
                self.filesize,
                self
            )
        )
        self.post_threadpool_thread.daemon = True
        self.post_threadpool_thread.start()

        # Monitoring thread starts itself on construction.
        self.control_thread = ControlThread(self)

        if blocking:
            self.wait(raise_exceptions=True)
||||
|
|
||||
|
def _exc_callback(self, req, e): |
||||
|
self.errors.append(e[0]) |
||||
|
self.logger.exception(e[1]) |
||||
|
|
||||
|
def retry(self, eStr=""): |
||||
|
if self.current_attemp < self.attemps_limit: |
||||
|
self.current_attemp += 1 |
||||
|
self.status = "ready" |
||||
|
self.shared_var.value = 0 |
||||
|
self.thread_shared_cmds = {} |
||||
|
self.start() |
||||
|
|
||||
|
else: |
||||
|
s = 'The maximum retry attempts reached' |
||||
|
if eStr: |
||||
|
s += " ({})".format(eStr) |
||||
|
self.errors.append(urllib.error.HTTPError(self.url, "0", s, {}, StringIO())) |
||||
|
self._failed = True |
||||
|
|
||||
|
def try_next_mirror(self, e=None): |
||||
|
if self.mirrors: |
||||
|
if e: |
||||
|
self.errors.append(e) |
||||
|
self.status = "ready" |
||||
|
self.shared_var.value = 0 |
||||
|
self.url = self.mirrors.pop(0) |
||||
|
self.logger.info('Using url "{}"'.format(self.url)) |
||||
|
self.start() |
||||
|
else: |
||||
|
self._failed = True |
||||
|
self.errors.append(e) |
||||
|
|
||||
|
def get_eta(self, human=False): |
||||
|
''' |
||||
|
Get estimated time of download completion, in seconds. Returns `0` if there is |
||||
|
no enough data to calculate the estimated time (this will happen on the approx. |
||||
|
first 5 seconds of each download). |
||||
|
|
||||
|
:param human: If true, returns a human-readable formatted string. Else, returns an int type number |
||||
|
:type human: bool |
||||
|
:rtype: int/string |
||||
|
''' |
||||
|
if human: |
||||
|
s = utils.time_human(self.control_thread.get_eta()) |
||||
|
return s if s else "TBD" |
||||
|
return self.control_thread.get_eta() |
||||
|
|
||||
|
def get_speed(self, human=False): |
||||
|
''' |
||||
|
Get current transfer speed in bytes per second. |
||||
|
|
||||
|
:param human: If true, returns a human-readable formatted string. Else, returns an int type number |
||||
|
:type human: bool |
||||
|
:rtype: int/string |
||||
|
''' |
||||
|
if human: |
||||
|
return "{}/s".format(utils.sizeof_human(self.control_thread.get_speed())) |
||||
|
return self.control_thread.get_speed() |
||||
|
|
||||
|
def get_progress(self): |
||||
|
''' |
||||
|
Returns the current progress of the download, as a float between `0` and `1`. |
||||
|
|
||||
|
:rtype: float |
||||
|
''' |
||||
|
if not self.filesize: |
||||
|
return 0 |
||||
|
if self.control_thread.get_dl_size() <= self.filesize: |
||||
|
return 1.0*self.control_thread.get_dl_size()/self.filesize |
||||
|
return 1.0 |
||||
|
|
||||
|
def get_progress_bar(self, length=20): |
||||
|
''' |
||||
|
Returns the current progress of the download as a string containing a progress bar. |
||||
|
|
||||
|
.. NOTE:: |
||||
|
That's an alias for pySmartDL.utils.progress_bar(obj.get_progress()). |
||||
|
|
||||
|
:param length: The length of the progress bar in chars. Default is 20. |
||||
|
:type length: int |
||||
|
:rtype: string |
||||
|
''' |
||||
|
return utils.progress_bar(self.get_progress(), length) |
||||
|
|
||||
|
def isFinished(self): |
||||
|
''' |
||||
|
Returns if the task is finished. |
||||
|
|
||||
|
:rtype: bool |
||||
|
''' |
||||
|
if self.status == "ready": |
||||
|
return False |
||||
|
if self.status == "finished": |
||||
|
return True |
||||
|
return not self.post_threadpool_thread.is_alive() |
||||
|
|
||||
|
def isSuccessful(self): |
||||
|
''' |
||||
|
Returns if the download is successfull. It may fail in the following scenarios: |
||||
|
|
||||
|
- Hash check is enabled and fails. |
||||
|
- All mirrors are down. |
||||
|
- Any local I/O problems (such as `no disk space available`). |
||||
|
|
||||
|
.. NOTE:: |
||||
|
Call `get_errors()` to get the exceptions, if any. |
||||
|
|
||||
|
Will raise `RuntimeError` if it's called when the download task is not finished yet. |
||||
|
|
||||
|
:rtype: bool |
||||
|
''' |
||||
|
|
||||
|
if self._killed: |
||||
|
return False |
||||
|
|
||||
|
n = 0 |
||||
|
while self.status != 'finished': |
||||
|
n += 1 |
||||
|
time.sleep(0.1) |
||||
|
if n >= 15: |
||||
|
raise RuntimeError("The download task must be finished in order to see if it's successful. (current status is {})".format(self.status)) |
||||
|
|
||||
|
return not self._failed |
||||
|
|
||||
|
    def get_errors(self):
        '''
        Get errors happened while downloading.

        :rtype: list of `Exception` instances
        '''
        # Returns the live list, not a copy -- callers should not mutate it.
        return self.errors
||||
|
|
||||
|
    def get_status(self):
        '''
        Returns the current status of the task. Possible values: *ready*,
        *downloading*, *paused*, *combining*, *finished*.

        :rtype: string
        '''
        # Plain accessor; the status string is updated by start()/pause()/
        # stop() and the control thread.
        return self.status
||||
|
|
||||
|
def wait(self, raise_exceptions=False): |
||||
|
''' |
||||
|
Blocks until the download is finished. |
||||
|
|
||||
|
:param raise_exceptions: If true, this function will raise exceptions. Default is *False*. |
||||
|
:type raise_exceptions: bool |
||||
|
''' |
||||
|
if self.status in ["ready", "finished"]: |
||||
|
return |
||||
|
|
||||
|
while not self.isFinished(): |
||||
|
time.sleep(0.1) |
||||
|
self.post_threadpool_thread.join() |
||||
|
self.control_thread.join() |
||||
|
|
||||
|
if self._failed and raise_exceptions: |
||||
|
raise self.errors[-1] |
||||
|
|
||||
|
def stop(self): |
||||
|
''' |
||||
|
Stops the download. |
||||
|
''' |
||||
|
if self.status == "downloading": |
||||
|
self.thread_shared_cmds['stop'] = "" |
||||
|
self._killed = True |
||||
|
|
||||
|
def pause(self):
    '''
    Suspends an active download.
    '''
    if self.status != "downloading":
        return
    self.status = "paused"
    self.thread_shared_cmds['pause'] = ""
||||
|
|
||||
|
def resume(self):
    '''
    Continues a paused download. Alias of `unpause()`.
    '''
    self.unpause()
||||
|
|
||||
|
def unpause(self):
    '''
    Continues a paused download. Alias of `resume()`.
    '''
    if self.status != "paused":
        return
    if 'pause' not in self.thread_shared_cmds:
        return
    self.status = "downloading"
    del self.thread_shared_cmds['pause']
||||
|
|
||||
|
def limit_speed(self, speed):
    '''
    Caps the transfer speed of the task.

    :param speed: Speed in bytes per download per second. Zero pauses the
        task; negative values will not limit the speed. Default is `-1`.
    :type speed: int
    '''
    if self.status == "downloading":
        # A limit of zero is equivalent to pausing the task.
        (self.pause if speed == 0 else self.unpause)()

    if speed > 0:
        # Every worker thread gets an equal share of the byte budget.
        self.thread_shared_cmds['limit'] = speed / self.threads_count
    elif 'limit' in self.thread_shared_cmds:
        del self.thread_shared_cmds['limit']
||||
|
|
||||
|
def get_dest(self):
    '''
    Returns the path the downloaded file is (or will be) written to.
    When no destination was supplied to the class, this points into a
    temp folder.

    :rtype: string
    '''
    return self.dest
||||
|
def get_dl_time(self, human=False):
    '''
    Returns the elapsed download time in seconds (`0` when the task
    has no control thread yet).

    :param human: If true, returns a human-readable formatted string. Else, returns an int type number
    :type human: bool
    :rtype: int/string
    '''
    if not self.control_thread:
        return 0
    elapsed = self.control_thread.get_dl_time()
    return utils.time_human(elapsed) if human else elapsed
||||
|
|
||||
|
def get_dl_size(self, human=False):
    '''
    Returns the downloaded-bytes counter, in bytes.

    :param human: If true, returns a human-readable formatted string. Else, returns an int type number
    :type human: bool
    :rtype: int/string
    '''
    if not self.control_thread:
        return 0
    size = self.control_thread.get_dl_size()
    return utils.sizeof_human(size) if human else size
||||
|
|
||||
|
def get_final_filesize(self, human=False):
    '''
    Returns the total download size, in bytes.

    :param human: If true, returns a human-readable formatted string. Else, returns an int type number
    :type human: bool
    :rtype: int/string
    '''
    if not self.control_thread:
        return 0
    total = self.control_thread.get_final_filesize()
    return utils.sizeof_human(total) if human else total
||||
|
|
||||
|
|
||||
|
def get_data(self, binary=False, bytes=-1):
    '''
    Returns the contents of the downloaded file. Raises `RuntimeError`
    when called before the download task has finished.

    :param binary: If true, will read the data as binary. Else, will read it as text.
    :type binary: bool
    :param bytes: Number of bytes to read. Negative values will read until EOF. Default is `-1`.
    :type bytes: int
    :rtype: string
    '''
    if self.status != 'finished':
        raise RuntimeError("The download task must be finished in order to read the data. (current status is %s)" % self.status)

    mode = 'rb' if binary else 'r'
    with open(self.get_dest(), mode) as fh:
        if bytes > 0:
            return fh.read(bytes)
        return fh.read()
||||
|
|
||||
|
def get_data_hash(self, algorithm):
    '''
    Computes a digest of the downloaded data. Raises `RuntimeError`
    (via `get_data()`) when the download task is not finished yet.

    :param algorithm: Hashing algorithm.
    :type algorithm: string
    :rtype: string

    .. WARNING::
        The hashing algorithm must be supported on your system, as documented at `hashlib documentation page <http://docs.python.org/3/library/hashlib.html>`_.
    '''
    digest = hashlib.new(algorithm)
    digest.update(self.get_data(binary=True))
    return digest.hexdigest()
||||
|
|
||||
|
def get_json(self):
    '''
    Parses the downloaded data as JSON. Raises `RuntimeError` (via
    `get_data()`) when the task is not finished yet, and
    `json.decoder.JSONDecodeError` when the data is not valid JSON.

    :rtype: dict
    '''
    return json.loads(self.get_data())
||||
|
|
||||
|
def post_threadpool_actions(pool, args, expected_filesize, SmartDLObj):
    '''
    Runs the finalization steps once the download thread pool is done:
    propagates worker exceptions, validates the combined size, merges the
    part files and verifies the hash. Run this in its own thread.

    :param pool: The executor used for the download (provides `done()`,
        `get_exception()` and `get_exceptions()`).
    :param args: Arguments for `utils.combine_files` -- `(parts, dest)`.
    :param expected_filesize: Expected total size in bytes (0 if unknown).
    :param SmartDLObj: The owning `SmartDL` instance.
    '''
    while not pool.done():
        time.sleep(0.1)

    if SmartDLObj._killed:
        return

    if pool.get_exception():
        for exc in pool.get_exceptions():
            SmartDLObj.logger.exception(exc)

        SmartDLObj.retry(str(pool.get_exception()))

    if SmartDLObj._failed:
        SmartDLObj.logger.warning("Task had errors. Exiting...")
        return

    if expected_filesize:  # if not zero, expected filesize is known
        threads = len(args[0])
        total_filesize = sum(os.path.getsize(x) for x in args[0])
        diff = math.fabs(expected_filesize - total_filesize)

        # A thread may download up to ~4KB extra per thread because of
        # NTFS's block size, so only larger discrepancies are errors.
        if diff > 4 * 1024 * threads:
            # BUGFIX: the format arguments were previously passed in the
            # wrong order (total_filesize/expected_filesize/diff), so the
            # logged message was misleading.
            errMsg = 'Diff between downloaded files and expected filesizes is {}B (filesize: {}, expected_filesize: {}, {} threads).'.format(diff, total_filesize, expected_filesize, threads)
            SmartDLObj.logger.warning(errMsg)
            SmartDLObj.retry(errMsg)
            return

    SmartDLObj.status = "combining"
    utils.combine_files(*args)

    if SmartDLObj.verify_hash:
        dest_path = args[-1]
        hash_ = utils.get_file_hash(SmartDLObj.hash_algorithm, dest_path)

        if hash_ == SmartDLObj.hash_code:
            SmartDLObj.logger.info('Hash verification succeeded.')
        else:
            SmartDLObj.logger.warning('Hash verification failed.')
            # BUGFIX: previously the builtin `hash` function was passed
            # instead of the computed digest `hash_`.
            SmartDLObj.try_next_mirror(HashFailedException(os.path.basename(dest_path), hash_, SmartDLObj.hash_code))
@ -0,0 +1,370 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
''' |
||||
|
The Utils class contains many functions for project-wide use. |
||||
|
''' |
||||
|
|
||||
|
import os |
||||
|
import sys |
||||
|
import urllib.request, urllib.parse, urllib.error |
||||
|
import random |
||||
|
import logging |
||||
|
import re |
||||
|
import hashlib |
||||
|
from concurrent import futures |
||||
|
from math import log, ceil |
||||
|
import shutil |
||||
|
|
||||
|
DEFAULT_LOGGER_CREATED = False |
||||
|
|
||||
|
def combine_files(parts, dest, chunkSize = 1024 * 1024 * 4):
    '''
    Concatenates the given part files into a single destination file,
    deleting each part once it has been consumed.

    :param parts: Source files.
    :type parts: list of strings
    :param dest: Destination file.
    :type dest: string
    :param chunkSize: Fetching chunk size.
    :type chunkSize: int
    '''
    if len(parts) == 1:
        # Single part: a rename is enough, no copying needed.
        shutil.move(parts[0], dest)
        return

    with open(dest, 'wb') as sink:
        for part in parts:
            with open(part, 'rb') as source:
                while True:
                    block = source.read(chunkSize)
                    if not block:
                        break
                    sink.write(block)
            os.remove(part)
||||
|
|
||||
|
def url_fix(s, charset='utf-8'):
    '''
    Sanitizes a user-supplied URL by percent-encoding unsafe characters
    (spaces, non-ASCII, and so on), similar to how browsers handle data
    entered into the address bar:

    >>> url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)')
    'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

    :param s: Url address.
    :type s: string
    :param charset: The target charset for the URL if the url was
        given as unicode string. Default is 'utf-8'.
    :type charset: string
    :rtype: string

    (taken from `werkzeug.utils <http://werkzeug.pocoo.org/docs/utils/>`_)
    '''
    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(s)
    return urllib.parse.urlunsplit((
        scheme,
        netloc,
        urllib.parse.quote(path, '/%'),
        urllib.parse.quote_plus(query, ':&%='),
        fragment,
    ))
||||
|
|
||||
|
def progress_bar(progress, length=20):
    '''
    Renders a textual progress bar.

    >>> progress_bar(0.6)
    '[##########--------]'

    :param progress: Number between 0 and 1 describes the progress.
    :type progress: float
    :param length: The length of the progress bar in chars. Default is 20.
    :type length: int
    :rtype: string
    '''
    inner = length - 2  # room left after the two bracket chars
    clamped = min(max(progress, 0), 1)
    filled = int(clamped * inner)
    return "[{}{}]".format("#" * filled, "-" * (inner - filled))
||||
|
|
||||
|
def is_HTTPRange_supported(url, timeout=15):
    '''
    Checks whether the server behind `url` honors the HTTP `Range`
    request header (`Byte serving <https://en.wikipedia.org/wiki/Byte_serving>`_),
    by requesting a few bytes and comparing the answered Content-Length
    with the full file size.

    :param url: Url address.
    :type url: string
    :param timeout: Timeout in seconds. Default is 15.
    :type timeout: int
    :rtype: bool
    '''
    url = url.replace(' ', '%20')

    fullsize = get_filesize(url, timeout=timeout)
    if not fullsize:
        return False

    # Ask for the first four bytes only; a compliant server answers
    # with a partial Content-Length.
    request = urllib.request.Request(url, headers={'Range': 'bytes=0-3'})
    response = urllib.request.urlopen(request, timeout=timeout)
    response.close()

    if "Content-Length" not in response.headers:
        return False

    return int(response.headers["Content-Length"]) != fullsize
||||
|
|
||||
|
def get_filesize(url, timeout=15):
    '''
    Fetches a remote file's size over HTTP.

    :param url: Url address.
    :type url: string
    :param timeout: Timeout in seconds. Default is 15.
    :type timeout: int
    :returns: Size in bytes (`0` when it cannot be determined).
    :rtype: int
    '''
    try:
        response = urllib.request.urlopen(url, timeout=timeout)
        return int(response.headers["Content-Length"])
    except (IndexError, KeyError, TypeError, urllib.error.HTTPError, urllib.error.URLError):
        return 0
||||
|
|
||||
|
def get_random_useragent():
    '''
    Picks one of a list of popular user-agent strings at random.
    List taken from `here <http://techblog.willshouse.com/2012/01/03/most-common-user-agents/>`_, last updated on 2020/09/19.

    :returns: user-agent
    :rtype: string
    '''
    user_agents = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:80.0) Gecko/20100101 Firefox/80.0",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36 Edg/85.0.564.51",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:79.0) Gecko/20100101 Firefox/79.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Safari/605.1.15",
    )
    return random.choice(user_agents)
||||
|
|
||||
|
def sizeof_human(num):
    '''
    Formats a byte count as a human-readable string. Taken from `here <http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size>`_.

    >>> sizeof_human(175799789)
    '167.7 MB'

    :param num: Size in bytes.
    :type num: int

    :rtype: string
    '''
    # (unit suffix, number of decimal places) per power of 1024
    units = [('B', 0), ('kB', 0), ('MB', 1), ('GB', 2), ('TB', 2), ('PB', 2)]

    if num > 1:
        exponent = min(int(log(num, 1024)), len(units) - 1)
        quotient = float(num) / 1024 ** exponent
        unit, decimals = units[exponent]
        return ('{:,.%sf} {}' % decimals).format(quotient, unit)

    if num == 0:
        return '0 bytes'
    if num == 1:
        return '1 byte'
||||
|
|
||||
|
def time_human(duration, fmt_short=False, show_ms=False):
    '''
    Formats a duration as a human-readable string. Based on code from `here <http://stackoverflow.com/questions/6574329/how-can-i-produce-a-human-readable-difference-when-subtracting-two-unix-timestam>`_.

    >>> time_human(175799789)
    '6 years, 2 weeks, 4 days, 17 hours, 16 minutes, 29 seconds'
    >>> time_human(589, fmt_short=True)
    '9m49s'

    :param duration: Duration in seconds.
    :type duration: int/float
    :param fmt_short: Format as a short string (`47s` instead of `47 seconds`)
    :type fmt_short: bool
    :param show_ms: Specify milliseconds in the string.
    :type show_ms: bool
    :rtype: string
    '''
    ms = int(duration % 1 * 1000)
    duration = int(duration)
    if duration == 0 and (not show_ms or ms == 0):
        return "0s" if fmt_short else "0 seconds"

    INTERVALS = [1, 60, 3600, 86400, 604800, 2419200, 29030400]
    if fmt_short:
        # BUGFIX: this list used to have only six entries against seven
        # intervals, so the month interval was labelled 'y' and the year
        # interval (29030400) was never used.
        NAMES = [
            ('s', 's'),
            ('m', 'm'),
            ('h', 'h'),
            ('d', 'd'),
            ('w', 'w'),
            ('mo', 'mo'),
            ('y', 'y')
        ]
    else:
        NAMES = [
            ('second', 'seconds'),
            ('minute', 'minutes'),
            ('hour', 'hours'),
            ('day', 'days'),
            ('week', 'weeks'),
            ('month', 'months'),
            ('year', 'years')
        ]

    result = []

    for i in range(len(NAMES)-1, -1, -1):
        a = duration // INTERVALS[i]
        if a > 0:
            # 1 % a is 0 only when a == 1, selecting the singular form.
            result.append((a, NAMES[i][1 % a]))
            duration -= a * INTERVALS[i]

    if show_ms and ms > 0:
        result.append((ms, "ms" if fmt_short else "milliseconds"))

    if fmt_short:
        return "".join(["%s%s" % x for x in result])
    return ", ".join(["%s %s" % x for x in result])
||||
|
|
||||
|
def get_file_hash(algorithm, path):
    '''
    Computes the hash digest of a file, reading it in 1 MB blocks.

    .. WARNING::
        The hashing algorithm must be supported on your system, as documented at `hashlib documentation page <http://docs.python.org/3/library/hashlib.html>`_.

    :param algorithm: Hashing algorithm.
    :type algorithm: string
    :param path: The file path
    :type path: string
    :rtype: string
    '''
    digest = hashlib.new(algorithm)
    block_size = 1 * 1024 ** 2  # 1 MB

    with open(path, 'rb') as stream:
        while True:
            chunk = stream.read(block_size)
            if not chunk:
                break
            digest.update(chunk)

    return digest.hexdigest()
||||
|
|
||||
|
def calc_chunk_size(filesize, threads, minChunkFile):
    '''
    Splits a `filesize`-byte range into per-thread byte chunks.

    :param filesize: filesize in bytes.
    :type filesize: int
    :param threads: Number of threads
    :type threads: int
    :param minChunkFile: Minimum chunk size
    :type minChunkFile: int
    :rtype: Array of (startByte, endByte) tuples
    '''
    if not filesize:
        return [(0, 0)]

    # Shrink the thread count until each chunk is at least minChunkFile big.
    while threads > 1 and ceil(filesize / threads) < minChunkFile:
        threads -= 1

    chunk = ceil(filesize / threads)
    ranges = []
    start = 0
    for _ in range(threads):
        end = min(start + chunk, filesize - 1)
        ranges.append((start, end))
        start += chunk + 1
    return ranges
||||
|
|
||||
|
def create_debugging_logger():
    '''
    Returns the project's debugging logger, attaching a console handler
    the first time it is requested.

    :rtype: `logging.Logger` instance
    '''
    global DEFAULT_LOGGER_CREATED

    logger = logging.getLogger('pySmartDL')

    # Only wire up the handler once, or messages would be duplicated.
    if not DEFAULT_LOGGER_CREATED:
        logger.setLevel(logging.DEBUG)
        handler = logging.StreamHandler()
        handler.setLevel(logging.DEBUG)
        handler.setFormatter(logging.Formatter('[%(levelname)s||%(thread)d@{%(pathname)s:%(lineno)d}] %(message)s'))
        logger.addHandler(handler)
        DEFAULT_LOGGER_CREATED = True

    return logger
||||
|
|
||||
|
class DummyLogger(object):
    '''
    A no-op logger: any method call (`debug()`, `warning()`, etc.) is
    accepted and silently ignored.
    '''
    def __init__(self):
        pass

    def dummy_func(self, *args, **kargs):
        # Accepts any call signature and does nothing.
        pass

    def __getattr__(self, name):
        # Let dunder lookups (e.g. __copy__, __deepcopy__) fail normally
        # so protocols that probe for them keep working.
        # BUGFIX: previously called `object.__getattr__(name)`, which does
        # not exist; it only "worked" because that lookup itself raised
        # AttributeError. Raise it explicitly instead.
        if name.startswith('__'):
            raise AttributeError(name)
        return self.dummy_func
||||
|
|
||||
|
class ManagedThreadPoolExecutor(futures.ThreadPoolExecutor):
    '''
    A `ThreadPoolExecutor` that remembers every submitted future, so the
    pool's overall completion state and raised exceptions can be queried.
    '''
    def __init__(self, max_workers):
        super().__init__(max_workers)
        self._futures = []

    def submit(self, fn, *args, **kwargs):
        '''Schedules `fn` and records its future for later inspection.'''
        fut = super().submit(fn, *args, **kwargs)
        self._futures.append(fut)
        return fut

    def done(self):
        '''Returns True once every submitted future has completed.'''
        return all(fut.done() for fut in self._futures)

    def get_exceptions(self):
        '''
        Return all the exceptions raised.

        :rtype: List of `Exception` instances'''
        return [fut.exception() for fut in self._futures if fut.exception()]

    def get_exception(self):
        '''
        Returns only the first exception. Returns None if no exception was raised.

        :rtype: `Exception` instance
        '''
        for fut in self._futures:
            exc = fut.exception()
            if exc:
                return exc
        return None
Loading…
Reference in new issue