# Source code for reacnetgenerator.utils

# SPDX-License-Identifier: LGPL-3.0-or-later
# cython: language_level=3
# cython: linetrace=True
"""Provide utils for ReacNetGenerator."""


import asyncio
import hashlib
import itertools
import os
import pickle
import shutil
from contextlib import ExitStack
from multiprocessing import Pool, Semaphore
from typing import (
    IO,
    TYPE_CHECKING,
    Any,
    AnyStr,
    BinaryIO,
    Callable,
    Generator,
    Iterable,
    List,
    Optional,
    Tuple,
    Union,
)

import lz4.frame
import numpy as np
import requests
from requests.adapters import HTTPAdapter
from tqdm.auto import tqdm

from ._logging import logger

if TYPE_CHECKING:
    import multiprocessing.pool
    import multiprocessing.synchronize

    import reacnetgenerator


class WriteBuffer:
    """Collect pieces of output and write them to a file in batches.

    Writing to a file is expensive, so contents are accumulated in a list
    and written together once the list grows beyond a threshold.

    Parameters
    ----------
    f: fileObject
        The file object to write. Its ``name`` attribute is copied to
        ``self.name``; its ``mode`` attribute is only consulted when
        `sep` is None.
    linenumber: int, default: 1200
        The number of contents to keep in the buffer. The buffer is flushed
        as soon as it holds more than this number of items.
    sep: str or bytes, default: None
        The separator written between (and after) contents. If None
        (default), an empty separator matching the file mode is used:
        ``""`` for mode ``"w"`` and ``b""`` for mode ``"wb"``.

    Raises
    ------
    RuntimeError
        If `sep` is None and the file mode is neither ``"w"`` nor ``"wb"``.
    """

    def __init__(
        self, f: IO, linenumber: int = 1200, sep: Optional[AnyStr] = None
    ) -> None:
        self.f = f
        if sep is None:
            # Derive an empty separator of the right type from the file mode.
            sep = {"w": "", "wb": b""}.get(f.mode)
            if sep is None:
                raise RuntimeError("File mode should be w or wb!")
        self.sep = sep
        self.linenumber = linenumber
        self.buff = []
        self.name = self.f.name

    def append(self, text: AnyStr) -> None:
        """Append a text.

        Parameters
        ----------
        text : str or bytes
            The text to be appended.
        """
        self.buff.append(text)
        self.check()

    def extend(self, text: Iterable[AnyStr]) -> None:
        """Extend texts.

        Parameters
        ----------
        text : list of strs or bytes
            Texts to be extended.
        """
        self.buff.extend(text)
        self.check()

    def check(self) -> None:
        """Flush the buffer if it holds more than `linenumber` items."""
        if len(self.buff) > self.linenumber:
            self.flush()

    def flush(self) -> None:
        """Write the buffered contents to the file and empty the buffer."""
        if not self.buff:
            return
        joined = self.sep.join(self.buff)
        # A trailing separator is written after every flushed batch.
        self.f.writelines([joined, self.sep])
        del self.buff[:]

    def __enter__(self) -> "WriteBuffer":
        """Enter the context."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Flush pending contents, then exit the context of the file."""
        self.flush()
        self.f.__exit__(exc_type, exc_value, traceback)
def appendIfNotNone(f: Union[WriteBuffer, ExitStack], wbytes: Optional[AnyStr]) -> None:
    """Append a line to a file if the line is not None.

    Parameters
    ----------
    f : WriteBuffer
        The file to write.
    wbytes : str or bytes
        The line to write. Nothing happens when it is None.
    """
    if wbytes is None:
        return
    # The signature also admits ExitStack; only a WriteBuffer can be appended to.
    assert not isinstance(f, ExitStack)
    f.append(wbytes)
def produce(
    semaphore: "multiprocessing.synchronize.Semaphore",
    plist: Iterable[Any],
    parameter: Any,
) -> Generator[Tuple[Any, Any], None, None]:
    """Item producer throttled by a semaphore.

    The semaphore is acquired once before each item is handed out, which
    prevents large memory usage due to slow IO.

    Parameters
    ----------
    semaphore : multiprocessing.Semaphore
        The semaphore to acquire.
    plist : list of objects
        The list of items to be passed.
    parameter : object
        The parameter yielded with each item. If None, the bare item is
        yielded instead of an ``(item, parameter)`` tuple.

    Yields
    ------
    item: object
        The item to be yielded.
    parameter: object
        The parameter yielded with each item.
    """
    attach_parameter = parameter is not None
    for element in plist:
        semaphore.acquire()
        yield (element, parameter) if attach_parameter else element
def compress(x: Union[str, bytes]) -> bytes:
    """Compress the line.

    This function reduces IO overhead to speed up the program. The data is
    compressed with lz4 (``lz4.frame``, compression level 0).

    The compressed format is size + data + size + data + ..., where each
    size field is 64 bytes long and holds the length of the following
    compressed data as a little-endian integer.

    Parameters
    ----------
    x : str or bytes
        The line to compress. A str is encoded to bytes first.

    Returns
    -------
    bytes
        The 64-byte size field followed by the compressed data.
    """
    payload = x.encode() if isinstance(x, str) else x
    block = lz4.frame.compress(payload, compression_level=0)
    header = len(block).to_bytes(64, byteorder="little")
    return header + block
def decompress(x: bytes, isbytes: bool = False) -> Union[str, bytes]:
    """Decompress the line.

    Parameters
    ----------
    x : bytes
        The line to decompress. The first 64 bytes (the size field) are
        skipped and the remainder is decompressed with lz4.
    isbytes : bool, optional, default: False
        If the decompressed content is bytes. If not, the line will be decoded.

    Returns
    -------
    str or bytes
        The decompressed line.
    """
    raw = lz4.frame.decompress(x[64:])
    return raw if isbytes else raw.decode()
def listtobytes(x: Any) -> bytes:
    """Convert an object to a compressed line.

    The object is pickled and the pickle is passed to `compress`.

    Parameters
    ----------
    x : object
        The object to convert, such as numpy.ndarray.

    Returns
    -------
    bytes
        The compressed line.
    """
    pickled = pickle.dumps(x)
    return compress(pickled)
def read_compressed_block(f: BinaryIO) -> Generator[bytes, None, None]:
    """Read compressed binary file, assuming the format is size + data + size + data + ...

    Each size field is 64 bytes long and is interpreted as a little-endian
    integer giving the length of the data that follows.

    Parameters
    ----------
    f : fileObject
        The file object to read.

    Yields
    ------
    data: bytes
        The compressed block, including its leading 64-byte size field.
    """
    header = f.read(64)
    while header:
        body_length = int.from_bytes(header, byteorder="little")
        yield header + f.read(body_length)
        # The next header is only read once the consumer asks for more.
        header = f.read(64)
def bytestolist(x: bytes) -> Any:
    """Convert a compressed line to an object.

    The line is decompressed to bytes and the result is unpickled.

    Parameters
    ----------
    x : bytes
        The compressed line.

    Returns
    -------
    object
        The decompressed object.
    """
    payload = decompress(x, isbytes=True)
    assert isinstance(payload, bytes)
    # NOTE(review): unpickling is only safe for data this program wrote
    # itself - confirm no untrusted input reaches this function.
    restored = pickle.loads(payload)
    return restored
def listtostirng(
    l: Union[str, list, tuple, np.ndarray], sep: Union[List[str], Tuple[str, ...]]
) -> str:
    """Convert a list to string, that is easier to store.

    Parameters
    ----------
    l : str or array-like
        The list to convert, which can contain any number of dimensions.
        A str is returned unchanged; anything that is not a list, tuple or
        numpy.ndarray is converted with ``str``.
    sep : list of strs
        The separators for each dimension, outermost dimension first.

    Returns
    -------
    str
        The converted string.
    """
    if isinstance(l, str):
        return l
    if not isinstance(l, (list, tuple, np.ndarray)):
        return str(l)
    # The first separator joins this level; the rest go to the nested levels.
    outer, inner = sep[0], sep[1:]
    return outer.join(listtostirng(element, inner) for element in l)
def multiopen(
    pool: "multiprocessing.pool.Pool",
    func: Callable,
    l: IO,
    semaphore: Optional["multiprocessing.synchronize.Semaphore"] = None,
    nlines: Optional[int] = None,
    unordered: bool = True,
    return_num: bool = False,
    start: int = 0,
    extra: Optional[Any] = None,
    interval: Optional[int] = None,
    bar: bool = True,
    desc: Optional[str] = None,
    unit: str = "it",
    total: Optional[int] = None,
) -> Iterable:
    """Return an iterated object to process a file with multiple processors.

    The input is wrapped step by step: grouping into chunks of `nlines`,
    thinning by `interval`, numbering, semaphore throttling, mapping over
    the pool (chunk size 100) and finally an optional progress bar.

    Parameters
    ----------
    pool : multiprocessing.Pool
        The pool for multiprocessing.
    func : function
        The function to process lines.
    l : File object
        The file object.
    semaphore : multiprocessing.Semaphore, optional, default: None
        The semaphore to acquire. If None (default), the object will be passed
        without control.
    nlines : int, optional, default: None
        The number of lines to pass to the function each time. If None (default),
        only one line will be passed to the function.
    unordered : bool, optional, default: True
        Whether the process can be unordered.
    return_num : bool, optional, default: False
        If True, adds a counter to an iterable.
    start : int, optional, default: 0
        The start number of the counter.
    extra : object, optional, default: None
        The extra object passed to the item. It is only attached when a
        semaphore is given.
    interval : int, optional, default: None
        The interval of items that will be passed to the function. For example,
        if set to 10, a item will be passed once every 10 items and others will
        be dropped.
    bar : bool, optional, default: True
        If True, show a tqdm bar for the iteration.
    desc : str, optional, default: None
        The description of the iteration shown in the bar.
    unit : str, optional, default: it
        The unit of the iteration shown in the bar.
    total : int, optional, default: None
        The total number of the iteration shown in the bar.

    Returns
    -------
    object
        An object that can be iterated.
    """
    stream = l
    if nlines:
        # Passing the same iterator nlines times makes zip_longest hand out
        # consecutive groups of nlines items.
        stream = itertools.zip_longest(*([stream] * nlines))
    if interval:
        stream = itertools.islice(stream, 0, None, interval)
    if return_num:
        stream = enumerate(stream, start)
    if semaphore:
        stream = produce(semaphore, stream, extra)
    mapper = pool.imap_unordered if unordered else pool.imap
    stream = mapper(func, stream, 100)
    if bar:
        stream = tqdm(stream, desc=desc, unit=unit, total=total, disable=None)
    return stream
class SCOUROPTIONS:
    """Scour (SVG optimization) options.

    A plain container of boolean class attributes; all options are enabled
    except `newlines`.
    """

    # Drop the XML prolog and descriptive content.
    strip_xml_prolog = True
    remove_titles = True
    remove_descriptions = True
    remove_metadata = True
    remove_descriptive_elements = True
    strip_comments = True

    # Viewbox and attribute handling.
    enable_viewboxing = True
    strip_xml_space_attribute = True

    # ID handling.
    strip_ids = True
    shorten_ids = True

    # The only option that is switched off.
    newlines = False
class SharedRNGData:
    """Share ReacNetGenerator data with a class of the submodule.

    Selected attributes are copied from the central object onto this
    instance at construction time; `returnkeys` copies results back.

    Parameters
    ----------
    rng: reacnetgenerator.ReacNetGenerator
        The centered ReacNetGenerator class.
    usedRNGKeys: list of strs
        Keys that needs to pass from ReacNetGenerator class to the submodule.
    returnedRNGKeys: list of strs
        Keys that needs to pass from the submodule to ReacNetGenerator class.
        They are initialised to None on this instance.
    extraNoneKeys: list of strs, optional, default: None
        Set keys to None, which will be used in the submodule.
    """

    def __init__(
        self,
        rng: "reacnetgenerator.ReacNetGenerator",
        usedRNGKeys: List[str],
        returnedRNGKeys: List[str],
        extraNoneKeys: Optional[List[str]] = None,
    ) -> None:
        self.rng = rng
        self.returnedRNGKeys = returnedRNGKeys
        # Copy the requested values from the central object.
        for name in usedRNGKeys:
            setattr(self, name, getattr(self.rng, name))
        # Everything else starts as None: returned keys first, then extras.
        placeholders = list(returnedRNGKeys)
        if extraNoneKeys is not None:
            placeholders.extend(extraNoneKeys)
        for name in placeholders:
            setattr(self, name, None)

    def returnkeys(self) -> None:
        """Return back keys to ReacNetGenerator class."""
        target = self.rng
        for name in self.returnedRNGKeys:
            setattr(target, name, getattr(self, name))
def checksha256(filename: str, sha256_check: Union[str, List[str]]) -> bool:
    """Check sha256 of a file is correct.

    Parameters
    ----------
    filename : str
        The filename.
    sha256_check : str or list of strs
        The sha256 to be checked. When a list is given, matching any entry
        is sufficient.

    Returns
    -------
    bool
        Indicate whether sha256 is correct. False is also returned when the
        file does not exist.
    """
    if not os.path.isfile(filename):
        return False
    hasher = hashlib.sha256()
    # Hash through one reusable 128 KiB buffer instead of loading the file.
    view = memoryview(bytearray(128 * 1024))
    with open(filename, "rb", buffering=0) as f:
        for n in iter(lambda: f.readinto(view), 0):
            hasher.update(view[:n])
    sha256 = hasher.hexdigest()
    logger.info(f"SHA256 of {filename}: {sha256}")
    if sha256 in must_be_list(sha256_check):
        return True
    logger.warning("SHA256 is not correct.")
    # Log the content of the mismatching file. Undecodable bytes are replaced
    # so that a binary file cannot turn the mismatch into a UnicodeDecodeError,
    # and the handle is closed deterministically.
    with open(filename, errors="replace") as f:
        logger.warning(f.read())
    return False
async def download_file(
    urls: Union[str, List[str]], pathfilename: str, sha256: Optional[str]
) -> str:
    """Download files from remote urls if not exists.

    The urls are tried in order until one download succeeds.

    Parameters
    ----------
    urls: str or list of strs
        The url(s) that is available to download.
    pathfilename: str
        The downloading path of the file.
    sha256: str
        Sha256 of the file. If not None and match the file, the download
        will be skipped. If None, an existing file is always kept.

    Returns
    -------
    pathfilename: str
        The downloading path of the file.

    Raises
    ------
    RuntimeError
        If the file cannot be downloaded from any of the urls.
    """
    # download if not exists
    if os.path.isfile(pathfilename) and (
        sha256 is None or checksha256(pathfilename, sha256)
    ):
        return pathfilename

    # The session is only created when a download is needed and is closed
    # when the block is left.
    with requests.Session() as s:
        s.mount("http://", HTTPAdapter(max_retries=3))
        s.mount("https://", HTTPAdapter(max_retries=3))
        # from https://stackoverflow.com/questions/16694907
        for url in must_be_list(urls):
            logger.info(f"Try to download {pathfilename} from {url}")
            try:
                # The request itself is inside the try block so that a failing
                # mirror falls through to the next url instead of aborting.
                with s.get(url, stream=True) as r:
                    # Do not store an HTTP error page as the downloaded file.
                    r.raise_for_status()
                    with open(pathfilename, "wb") as f:
                        shutil.copyfileobj(r.raw, f)
                break
            except requests.exceptions.RequestException as e:
                logger.warning(f"Request {pathfilename} Error.", exc_info=e)
        else:
            raise RuntimeError(f"Cannot download {pathfilename}.")
    return pathfilename
async def gather_download_files(urls: List[dict]) -> None:
    """Asynchronously download files from remote urls if not exists.

    One `download_file` coroutine is created per dict, using its ``url``
    and ``fn`` entries and its optional ``sha256`` entry. See
    download_multifiles function for details.

    See Also
    --------
    download_multifiles
    """
    downloads = [
        download_file(entry["url"], entry["fn"], entry.get("sha256"))
        for entry in urls
    ]
    await asyncio.gather(*downloads)
def download_multifiles(urls: List[dict]) -> None:
    """Download multiple files from dicts.

    Parameters
    ----------
    urls : list of dicts
        The information of download files. Each dict should contain the
        following key:

        - url: str or list of strs
            The url(s) that is available to download.
        - fn: str
            The downloading path of the file.
        - sha256: str, optional, default: None
            Sha256 of the file. If not None and match the file, the download
            will be skipped.
    """
    download_all = gather_download_files(urls)
    asyncio.run(download_all)
def run_mp(nproc: int, **kwargs: Any) -> Iterable[Any]:
    """Process a file with multiple processors.

    A pool of `nproc` workers and a semaphore of ``nproc * 150`` slots are
    created; one slot is released each time the consumer comes back for the
    next result.

    Parameters
    ----------
    nproc : int
        The number of processors to be used.
    **kwargs : dict, optional
        Other parameters can be found in the `multiopen` method.

    Yields
    ------
    object
        The yielded object from the `multiopen` method.

    See Also
    --------
    multiopen
    """
    pool = Pool(nproc, maxtasksperchild=1000)
    semaphore = Semaphore(nproc * 150)
    try:
        results = multiopen(pool=pool, semaphore=semaphore, **kwargs)
        for item in results:
            yield item
            semaphore.release()
    except GeneratorExit:
        # The consumer stopped iterating early. That is not a failure, so
        # stop the workers without logging an error.
        pool.terminate()
        raise
    except BaseException:
        # Explicit equivalent of a bare except: also covers KeyboardInterrupt.
        logger.exception("run_mp failed")
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()
def must_be_list(obj: Union[Any, List[Any]]) -> List[Any]:
    """Convert a object to a list if the object is not a list.

    Parameters
    ----------
    obj : Object
        The object to convert.

    Returns
    -------
    obj: list
        If the input object is not a list, returns a list that only contains
        that object. Otherwise, returns that object itself (not a copy).
    """
    return obj if isinstance(obj, list) else [obj]