You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

371 lines
12 KiB

3 years ago
# -*- coding: utf-8 -*-
'''
The Utils class contains many functions for project-wide use.
'''
import os
import sys
import urllib.request, urllib.parse, urllib.error
import random
import logging
import re
import hashlib
from concurrent import futures
from math import log, ceil
import shutil
# Module-level flag: set to True by create_debugging_logger() once it has
# attached its console handler, so later calls do not add the handler again.
DEFAULT_LOGGER_CREATED = False
def combine_files(parts, dest, chunkSize=1024 * 1024 * 4):
    '''
    Merges several part files into one destination file.

    A single part is simply moved onto ``dest``. Otherwise ``dest`` is opened
    for binary writing, every part is appended to it in list order, and each
    part is deleted from disk right after it has been copied.

    :param parts: Source files.
    :type parts: list of strings
    :param dest: Destination file.
    :type dest: string
    :param chunkSize: Number of bytes read from a part per iteration.
    :type chunkSize: int
    '''
    if len(parts) == 1:
        shutil.move(parts[0], dest)
        return

    with open(dest, 'wb') as merged:
        for part_path in parts:
            with open(part_path, 'rb') as source:
                # read() returns b'' at end of file, which ends the iteration.
                for block in iter(lambda: source.read(chunkSize), b''):
                    merged.write(block)
            # The part is closed at this point, so it can be removed safely.
            os.remove(part_path)
def url_fix(s, charset='utf-8'):
    '''
    Percent-encodes the unsafe characters of a user-supplied URL.

    Users sometimes hand over an URL that is not strictly valid because it
    contains characters such as ' '. This function repairs the path and the
    query string in a way similar to what browsers do with typed-in addresses:

    >>> url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)')
    'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

    :param s: Url address.
    :type s: string
    :param charset: The target charset for the URL if the url was
                    given as unicode string. Default is 'utf-8'.
                    (Accepted for compatibility; the body does not read it.)
    :type charset: string
    :rtype: string

    (taken from `werkzeug.utils <http://werkzeug.pocoo.org/docs/utils/>`_)
    '''
    pieces = urllib.parse.urlsplit(s)
    # '/' and '%' stay untouched in the path so that separators and existing
    # escapes are not encoded a second time.
    quoted_path = urllib.parse.quote(pieces.path, '/%')
    # Same idea for the query string: keep its delimiters and existing escapes.
    quoted_query = urllib.parse.quote_plus(pieces.query, ':&%=')
    return urllib.parse.urlunsplit(
        (pieces.scheme, pieces.netloc, quoted_path, quoted_query, pieces.fragment)
    )
def progress_bar(progress, length=20):
    '''
    Builds a textual progress bar.

    >>> progress_bar(0.6)
    '[##########--------]'

    :param progress: Number between 0 and 1 describes the progress. Values outside that range are clamped.
    :type progress: float
    :param length: The length of the progress bar in chars, brackets included. Default is 20.
    :type length: int
    :rtype: string
    '''
    inner_width = length - 2  # The two brackets take up 2 chars.

    if progress < 0:
        clamped = 0
    elif progress > 1:
        clamped = 1
    else:
        clamped = progress

    filled = int(clamped * inner_width)
    return "[{}{}]".format("#" * filled, "-" * (inner_width - filled))
def is_HTTPRange_supported(url, timeout=15):
    '''
    Checks if a server allows `Byte serving <https://en.wikipedia.org/wiki/Byte_serving>`_,
    using the Range HTTP request header and the Content-Length HTTP response header.

    The full size is fetched first; then the first four bytes are requested
    with a ``Range`` header. The range counts as supported when the second
    response reports a ``Content-Length`` different from the full size.

    :param url: Url address.
    :type url: string
    :param timeout: Timeout in seconds. Default is 15.
    :type timeout: int
    :rtype: bool
    '''
    url = url.replace(' ', '%20')

    fullsize = get_filesize(url, timeout=timeout)
    if not fullsize:
        # Unknown (or zero) size: there is nothing to compare against.
        return False

    ranged_request = urllib.request.Request(url, headers={'Range': 'bytes=0-3'})
    with urllib.request.urlopen(ranged_request, timeout=timeout) as response:
        # Only the headers are needed; the body is never read.
        response_headers = response.headers

    if "Content-Length" not in response_headers:
        return False
    return int(response_headers["Content-Length"]) != fullsize
def get_filesize(url, timeout=15):
    '''
    Fetches file's size of a file over HTTP.

    The URL is opened and the ``Content-Length`` response header is parsed.
    The response is always closed before returning, so no connection is left
    open behind the caller's back.

    :param url: Url address.
    :type url: string
    :param timeout: Timeout in seconds. Default is 15.
    :type timeout: int
    :returns: Size in bytes, or 0 when the request fails or the server does
              not report a usable ``Content-Length``.
    :rtype: int
    '''
    try:
        # The context manager closes the response even though its body is
        # never read.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            content_length = response.headers["Content-Length"]
    except (IndexError, KeyError, TypeError, urllib.error.HTTPError, urllib.error.URLError):
        return 0

    try:
        return int(content_length)
    except (TypeError, ValueError):
        # Header missing (None) or not a number: treat the size as unknown.
        return 0
def get_random_useragent():
    '''
    Picks one user-agent string at random from a fixed list of popular ones.

    Taken from `here <http://techblog.willshouse.com/2012/01/03/most-common-user-agents/>`_, last updated on 2020/09/19.

    :returns: user-agent
    :rtype: string
    '''
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:80.0) Gecko/20100101 Firefox/80.0",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36 Edg/85.0.564.51",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:79.0) Gecko/20100101 Firefox/79.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Safari/605.1.15",
    ]
    chosen = random.choice(user_agents)
    return chosen
def sizeof_human(num):
    '''
    Human-readable formatting for filesizes. Based on code from `here <http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size>`_.

    >>> sizeof_human(175799789)
    '167.7 MB'
    >>> sizeof_human(-2048)
    '-2 kB'

    Units are powers of 1024 and stop at PB; larger values are expressed in
    PB. Exactly 0 and 1 are spelled out as '0 bytes' and '1 byte'. Negative
    sizes are formatted like their magnitude with a leading '-'.

    :param num: Size in bytes.
    :type num: int
    :rtype: string
    '''
    # (unit suffix, number of decimals shown for that unit)
    units = [('B', 0), ('kB', 0), ('MB', 1), ('GB', 2), ('TB', 2), ('PB', 2)]

    sign = '-' if num < 0 else ''
    magnitude = abs(num)

    if magnitude == 0:
        return '0 bytes'
    if magnitude == 1:
        return sign + '1 byte'

    # Repeated division instead of log(): dividing a float by 1024 is exact,
    # so exact powers of 1024 always land in the right unit.
    quotient = float(magnitude)
    exponent = 0
    while quotient >= 1024 and exponent < len(units) - 1:
        quotient /= 1024
        exponent += 1

    unit, num_decimals = units[exponent]
    return '{sign}{value:,.{decimals}f} {unit}'.format(
        sign=sign, value=quotient, decimals=num_decimals, unit=unit
    )
def time_human(duration, fmt_short=False, show_ms=False):
    '''
    Human-readable formatting for timing. Based on code from `here <http://stackoverflow.com/questions/6574329/how-can-i-produce-a-human-readable-difference-when-subtracting-two-unix-timestam>`_.

    >>> time_human(175799789)
    '6 years, 2 weeks, 4 days, 17 hours, 16 minutes, 29 seconds'
    >>> time_human(589, fmt_short=True)
    '9m49s'
    >>> time_human(29030400, fmt_short=True)
    '1y'

    A month is counted as 4 weeks and a year as 12 such months. The short
    format has no month unit (``m`` already means minutes), so there the
    time below a year is expressed in weeks and smaller units.

    :param duration: Duration in seconds.
    :type duration: int/float
    :param fmt_short: Format as a short string (`47s` instead of `47 seconds`)
    :type fmt_short: bool
    :param show_ms: Specify milliseconds in the string.
    :type show_ms: bool
    :rtype: string
    '''
    ms = int(duration % 1 * 1000)
    duration = int(duration)
    if duration == 0 and (not show_ms or ms == 0):
        return "0s" if fmt_short else "0 seconds"

    # (seconds per unit, short suffix or None, (singular, plural)), largest
    # first. Keeping all three in one row means a suffix can never be paired
    # with the wrong interval.
    units = [
        (29030400, 'y', ('year', 'years')),
        (2419200, None, ('month', 'months')),
        (604800, 'w', ('week', 'weeks')),
        (86400, 'd', ('day', 'days')),
        (3600, 'h', ('hour', 'hours')),
        (60, 'm', ('minute', 'minutes')),
        (1, 's', ('second', 'seconds')),
    ]

    result = []
    for seconds, suffix, (singular, plural) in units:
        if fmt_short and suffix is None:
            # No short suffix for this unit; smaller units absorb the time.
            continue
        count = duration // seconds
        if count > 0:
            if fmt_short:
                label = suffix
            else:
                label = singular if count == 1 else plural
            result.append((count, label))
            duration -= count * seconds

    if show_ms and ms > 0:
        result.append((ms, "ms" if fmt_short else "milliseconds"))

    if fmt_short:
        return "".join(["%s%s" % x for x in result])
    return ", ".join(["%s %s" % x for x in result])
def get_file_hash(algorithm, path):
    '''
    Computes the hex digest of a file, reading it in 1 MB blocks.

    .. WARNING::
        The hashing algorithm must be supported on your system, as documented at `hashlib documentation page <http://docs.python.org/3/library/hashlib.html>`_.

    :param algorithm: Hashing algorithm.
    :type algorithm: string
    :param path: The file path
    :type path: string
    :rtype: string
    '''
    block_size = 1 * 1024 ** 2  # 1 MB
    digest = hashlib.new(algorithm)
    with open(path, 'rb') as stream:
        # read() returns b'' at end of file, which ends the iteration.
        for block in iter(lambda: stream.read(block_size), b''):
            digest.update(block)
    return digest.hexdigest()
def calc_chunk_size(filesize, threads, minChunkFile):
    '''
    Calculates the byte chunks to download.

    The thread count is lowered (down to 1 at most) while an even split would
    give chunks smaller than ``minChunkFile``. The file is then cut into
    consecutive inclusive byte ranges; the last one is clipped to the end of
    the file. Ranges that would start past the end of the file are not
    emitted, so the result may hold fewer entries than ``threads``.

    :param filesize: filesize in bytes.
    :type filesize: int
    :param threads: Number of threads
    :type threads: int
    :param minChunkFile: Minimum chunk size
    :type minChunkFile: int
    :rtype: Array of (startByte,endByte) tuples
    '''
    if not filesize:
        return [(0, 0)]

    while ceil(filesize/threads) < minChunkFile and threads > 1:
        threads -= 1

    chunk = ceil(filesize/threads)
    last_byte = filesize - 1

    args = []
    pos = 0
    for _ in range(threads):
        if pos > last_byte:
            # Each range spans chunk+1 bytes, so the file can run out before
            # every thread got one; stop rather than emit (start > end).
            break
        args.append((pos, min(pos + chunk, last_byte)))
        pos += chunk + 1
    return args
def create_debugging_logger():
    '''
    Returns the 'pySmartDL' logger, set up to print debug output to console.

    The console handler is attached on the first call only; the module-level
    ``DEFAULT_LOGGER_CREATED`` flag records that this has been done.

    :rtype: `logging.Logger` instance
    '''
    global DEFAULT_LOGGER_CREATED

    logger = logging.getLogger('pySmartDL')
    if DEFAULT_LOGGER_CREATED:
        return logger

    handler = logging.StreamHandler()
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(logging.Formatter('[%(levelname)s||%(thread)d@{%(pathname)s:%(lineno)d}] %(message)s'))

    logger.setLevel(logging.DEBUG)
    logger.addHandler(handler)
    DEFAULT_LOGGER_CREATED = True
    return logger
class DummyLogger(object):
    '''
    A dummy logger. You can call `debug()`, `warning()`, etc on this object, and nothing will happen.
    '''
    def __init__(self):
        pass

    def dummy_func(self, *args, **kargs):
        '''
        Accepts any arguments, does nothing and returns None.
        '''
        pass

    def __getattr__(self, name):
        '''
        Resolves every unknown attribute to the do-nothing method.

        Names starting with a double underscore are reported as missing
        instead, so that lookups of special methods are not answered with a
        callable that has the wrong signature.

        :param name: Name of the attribute that normal lookup did not find.
        :type name: string
        :raises AttributeError: if `name` starts with a double underscore.
        '''
        if name.startswith('__'):
            # `object` defines no `__getattr__` to delegate to, so the
            # lookup failure is raised explicitly.
            raise AttributeError(
                "%r object has no attribute %r" % (type(self).__name__, name)
            )
        return self.dummy_func
class ManagedThreadPoolExecutor(futures.ThreadPoolExecutor):
    '''
    Managed Thread Pool Executor. A subclass of ThreadPoolExecutor that
    remembers every future it hands out, so they can be inspected together.
    '''
    def __init__(self, max_workers):
        super().__init__(max_workers)
        # Every future returned by submit(), in submission order.
        self._futures = []

    def submit(self, fn, *args, **kwargs):
        '''
        Schedules `fn(*args, **kwargs)` and records the resulting future.

        :rtype: `concurrent.futures.Future` instance
        '''
        submitted = super().submit(fn, *args, **kwargs)
        self._futures.append(submitted)
        return submitted

    def done(self):
        '''
        Tells whether every submitted future is done.
        True when nothing has been submitted.

        :rtype: bool
        '''
        return all(task.done() for task in self._futures)

    def get_exceptions(self):
        '''
        Return all the exceptions raised.

        :rtype: List of `Exception` instances'''
        raised = []
        for task in self._futures:
            error = task.exception()
            if error:
                raised.append(error)
        return raised

    def get_exception(self):
        '''
        Returns only the first exception. Returns None if no exception was raised.

        :rtype: `Exception` instance
        '''
        for task in self._futures:
            error = task.exception()
            if error:
                return error
        return None