Source code for ymp.download

import asyncio
import atexit
import hashlib
import logging
import os
import re
import threading
from typing import List, Optional, Union
from urllib.parse import urlsplit

import aiohttp

from tqdm import tqdm

from ymp.common import ensure_list

LOG = logging.getLogger(__name__)


class FileDownloader(object):
    """Manages download of a set of URLs

    Downloads happen concurrently using asynchronous network IO.

    Args:
      block_size: Byte size of chunks to download
      timeout:    Aiohttp cumulative timeout
      parallel:   Number of files to download in parallel
      loglevel:   Log level for messages sent to logging
                  (errors are sent with loglevel+10)
      alturls:    List of regexps modifying URLs
      retry:      Number of times to retry download
    """
    def __init__(self, block_size: int=4096, timeout: int=300,
                 parallel: int=4, loglevel: int=logging.WARNING,
                 alturls=None, retry: int=3):
        self._block_size = block_size
        self._timeout = timeout
        self._parallel = parallel
        self._retry = retry
        self._alturls = []
        alturls = ["///"] + (alturls or [])
        for pat in alturls:
            sep = pat[0]
            if pat.strip(sep):
                patsub = re.split(r"(?<=[^\\])" + sep, pat.strip(sep))
                if len(patsub) != 2:
                    raise ValueError("Malformed regular expression '{}'"
                                     "".format(pat))
                patsub[1] = patsub[1].replace(r"\/", "/")
            else:
                patsub = ["", ""]
            self._alturls.append(patsub)

        try:
            self.loop = asyncio.get_event_loop()
        except RuntimeError:
            # no loop in context (i.e. running in thread)
            self.loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self.loop)

        self._sem = asyncio.Semaphore(parallel)
        self._progress = LOG.getEffectiveLevel() <= loglevel
        self._loglevel = loglevel
        self._sum_bar = None
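    # Illustrative example (editorial sketch, not part of the original source):
    # each ``alturls`` entry is a sed-style "<sep>pattern<sep>replacement<sep>"
    # string whose first character selects the separator.  The hypothetical
    # pattern "/^ftp:/https:/" is split into ["^ftp:", "https:"], which
    # ``_download`` later applies via ``re.sub`` so that
    # "ftp://example.org/f.gz" is also tried as "https://example.org/f.gz".
    # The built-in "///" entry yields ["", ""], i.e. the unmodified URL.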

    def log(self, msg: str, *args, modlvl: int=0, **kwargs) -> None:
        """Send message to logger

        Honors loglevel set for the FileDownloader object.

        Args:
          msg: The log message
          modlvl: Added to default logging level for object
        """
        LOG.log(self._loglevel + modlvl, msg, *args, **kwargs)

    def error(self, msg: str, *args, **kwargs) -> None:
        """Send error to logger

        The message is sent with a log level 10 higher than the default
        for this object.
        """
        self.log(msg, *args, modlvl=10, **kwargs)
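    # Example (editorial note, not from the original source): with the default
    # loglevel of logging.WARNING, ``self.log("msg")`` is emitted at WARNING
    # (level 30), while ``self.error("msg")`` is emitted at WARNING + 10,
    # i.e. ERROR (level 40).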

    @staticmethod
    def make_bar_format(desc_width: int=20, count_width: int=0,
                        rate: bool=False, eta: bool=False,
                        have_total: bool=True) -> str:
        """Construct bar_format for tqdm

        Args:
          desc_width: minimum space allocated for description
          count_width: min space for counts
          rate: show rate to right of progress bar
          eta: show eta to right of progress bar
          have_total: whether a total exists (required to add percentage)
        """
        if have_total:
            left = '{{desc:<{dw}}} {{percentage:3.0f}}%'.format(dw=desc_width)
        else:
            # percentage not supplied by tqdm if there is no total
            left = '{{desc:<{dw}}}'.format(dw=desc_width)
        right = ' {{n_fmt:>{cw}}} / {{total_fmt:<{cw}}}'.format(cw=count_width)
        if rate:
            right += ' {{rate_fmt:>{cw}}}'.format(cw=count_width+2)
        if eta:
            right += ' ETA {remaining}'
        return left + '|{bar}|' + right
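    # For illustration (an assumed example, not part of the original source):
    # ``make_bar_format(20, 7, rate=True)`` returns
    #   '{desc:<20} {percentage:3.0f}%|{bar}| {n_fmt:>7} / {total_fmt:<7} {rate_fmt:>9}'
    # which is the format string the "Total bytes" progress bar in ``_run``
    # passes to tqdm.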

    async def _download(self, session: aiohttp.ClientSession,
                        url: str, dest: str,
                        md5: Optional[str]=None) -> bool:
        """Asynchronously download a single file

        - If ``dest`` points to an existing directory, the file name
          is derived from the trailing path portion of the URL.
        - Will skip the download for existing files with matching MD5.

        Args:
          session: aiohttp session object
          url:     source URL
          dest:    destination path
          md5:     optional md5 checksum to verify
        """
        if os.path.isdir(dest):
            parts = urlsplit(url)
            basename = os.path.basename(parts.path)
            destfile = os.path.join(dest, basename)
        else:
            basename = os.path.basename(dest)
            destfile = dest

        if os.path.exists(destfile) and md5 and not isinstance(md5, bool):
            if self._check_md5(basename, destfile, md5):
                return True

        tryurls = [re.sub(pat, rep, url) for pat, rep in self._alturls]
        for url in tryurls:  # try alturls
            exc = None
            for num_try in range(self._retry):  # retry after timeout
                if exc:
                    self.log("Downloading %s failed with %s. Retrying %i/%i",
                             basename, exc, num_try, self._retry - 1)
                try:
                    if await self._download_one(session, basename, url,
                                                destfile, md5):
                        return True
                    break
                except asyncio.TimeoutError as e:
                    exc = e
        return False

    def _check_md5(self, name, fname, md5):
        md5_new = hashlib.md5()
        with open(fname, 'rb') as f:
            while True:
                block = f.read(8192)
                if not block:
                    break
                md5_new.update(block)
        if md5_new.hexdigest() == md5.strip():
            self.log("Download skipped: %s (file exists, md5 verified)", name)
            return True
        return False

    async def _download_one(self, session, name, url, dest, md5):
        part = dest + ".part"
        if md5:
            md5_new = hashlib.md5()
        try:
            async with self._sem, \
                       session.get(url, timeout=self._timeout) as resp:
                if not resp.status == 200:
                    self.log("Download failed: %s (error code %i)",
                             name, resp.status)
                    self.log(" URL: '%s'", url.strip())
                    return False
                size = int(resp.headers.get('content-length', 0))
                if os.path.exists(dest):
                    existing_size = os.path.getsize(dest)
                    if existing_size == size:
                        if md5:
                            self.log("Overwriting: %s (md5 failed)", name)
                        else:
                            self.log("Download skipped: %s (file exists)",
                                     name)
                            return True
                    else:
                        self.log("Overwriting: %s (size mismatch %i!=%i)",
                                 name, size, existing_size)
                try:
                    self._sum_bar.total += size
                except AttributeError:
                    pass
                with open(part, mode="wb") as out, \
                     tqdm(total=size,
                          unit='B', unit_scale=True, unit_divisor=1024,
                          desc=name, leave=False, miniters=1,
                          disable=not self._progress,
                          bar_format=self.make_bar_format(40, 7, rate=True)
                          ) as t:
                    while True:
                        block = await resp.content.read(self._block_size)
                        if not block:
                            break
                        out.write(block)
                        if md5:
                            md5_new.update(block)
                        t.update(len(block))
                        if self._sum_bar is not None:
                            self._sum_bar.update(len(block))
            os.rename(part, dest)
            if md5:
                md5_hash = md5_new.hexdigest()
                if isinstance(md5, bool):
                    self.log("Download complete: %s (md5=%s)",
                             name, md5_hash.strip())
                elif md5.strip() == md5_hash:
                    self.log("Download complete: %s (md5 verified)", name)
                else:
                    self.error("Download failed: %s (md5 failed)", name)
                    return False
            return True
        except (asyncio.CancelledError, asyncio.TimeoutError):
            if os.path.exists(part):
                os.unlink(part)
            raise

    async def _run(self, urls: List[str], dest: str,
                   md5s: Optional[List[str]]=None) -> List[bool]:
        """Executes a download session

        Args:
          urls: List of URLs
          dest: Destination path
          md5s: Optional list of md5 checksums
        """
        if not md5s:
            md5s = [None] * len(urls)
        async with aiohttp.ClientSession() as session:
            if len(urls) == 1:
                # No need to show progress bar for just 1 file
                self.log("Downloading 1 file.")
                result = [await asyncio.ensure_future(
                    self._download(session, urls[0], dest, md5s[0])
                )]
                self.log("Finished download.")
            else:
                self.log("Downloading %i files.", len(urls))
                coros = [
                    asyncio.ensure_future(
                        self._download(session, url, dest, md5)
                    )
                    for url, md5 in zip(urls, md5s)
                ]
                with tqdm(asyncio.as_completed(coros),
                          total=len(coros), unit="Files",
                          desc="Total files:",
                          disable=not self._progress, leave=False,
                          bar_format=self.make_bar_format(20, 7, eta=True)
                          ) as t, \
                     tqdm(total=1,  # must be >0
                          unit="B", desc="Total bytes:",
                          unit_scale=True, unit_divisor=1024,
                          disable=not self._progress, leave=False,
                          miniters=1,
                          bar_format=self.make_bar_format(20, 7, rate=True)
                          ) as t2:
                    self._sum_bar = t2
                    result = [await coro for coro in t]
                self.log("Finished downloads.")
        return result

    def get(self, urls: Union[str, List[str]], dest: str,
            md5s: Optional[List[str]]=None) -> bool:
        """Download a list of URLs

        Args:
          urls: List of URLs
          dest: Destination folder
          md5s: List of MD5 sums to check

        Returns:
          True if all downloads succeeded or were skipped as already
          present and verified.
        """
        urls = ensure_list(urls)
        if not urls:
            return True  # nothing to do
        if len(urls) > 1:
            if not os.path.exists(dest):
                os.makedirs(dest)
        try:
            task = asyncio.ensure_future(self._run(urls, dest, md5s))
            self.loop.run_until_complete(task)
        except KeyboardInterrupt:
            end = asyncio.gather(*asyncio.Task.all_tasks())
            end.cancel()
            try:
                self.loop.run_until_complete(end)
            except asyncio.CancelledError:
                pass
            raise
        return all(task.result())
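
# Example usage (editorial sketch; the URLs, destination and checksum below
# are placeholders, not values taken from YMP):
#
#   downloader = FileDownloader(parallel=2, alturls=["/^ftp:/https:/"])
#   ok = downloader.get(
#       ["ftp://example.org/a.fa.gz", "ftp://example.org/b.fa.gz"],
#       "downloads",
#       md5s=["d41d8cd98f00b204e9800998ecf8427e", None],
#   )
#   # ``ok`` is True only if every file was downloaded or verified.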


class DownloadThread(object):
    def __init__(self):
        LOG.error("made downloader")
        self.loop = asyncio.new_event_loop()
        self.thread = threading.Thread(target=self.main)
        self.thread.start()
        atexit.register(self.terminate)

    def terminate(self):
        self.loop.call_soon_threadsafe(self.loop.stop)

    def main(self):
        LOG.error("here")
        asyncio.set_event_loop(self.loop)
        self.downloader = FileDownloader()
        self.loop.run_forever()

    def get(self, url, dest, md5):
        LOG.error("scheduling get %s", url)
        self.loop.call_soon_threadsafe(
            self.downloader.get, url, dest, md5
        )

# DOWNLOADER = DownloadThread()

# def download(url, dest, md5=None):
#     LOG.error("called download %s", url)
#     DOWNLOADER.get(url, dest, md5)
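
# Illustrative note (editorial, not part of the original source): if the
# commented-out module-level wrapper above were re-enabled, a call such as
#
#   download("https://example.org/reference.fa.gz", "references/")
#
# would schedule the fetch on the DownloadThread's event loop via
# call_soon_threadsafe without blocking the calling thread.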