# Source code for clldutils.misc

"""
This module provides miscellaneous utility functions, including backports of newer Python
functionality (:func:`lazyproperty`), formatting functions, etc.
"""

import re
import base64
import string
import typing
import pathlib
import warnings
import mimetypes
import unicodedata

# Names exported by a star-import of this module.
__all__ = [
    'data_url', 'log_or_raise', 'nfilter', 'to_binary', 'dict_merged', 'NoDefault', 'NO_DEFAULT',
    'xmlchars', 'format_size', 'UnicodeMixin', 'slug', 'encoded', 'lazyproperty',
]


def deprecated(msg):
    """
    Emit a `DeprecationWarning` which is shown on every call.

    :param msg: The warning message.
    """
    # Scope the "always" filter to this single call: `catch_warnings` restores the filter
    # list on exit, so the warning configuration of the application is left untouched.
    with warnings.catch_warnings():
        warnings.simplefilter('always', DeprecationWarning)
        # stacklevel=2 attributes the warning to the code calling `deprecated`.
        warnings.warn(msg, category=DeprecationWarning, stacklevel=2)


def data_url(
        content: typing.Union[bytes, str, pathlib.Path],
        mimetype: typing.Optional[str] = None) -> str:
    """
    Returns content encoded as base64 Data URI. Useful to include (smallish) media resources
    in HTML pages.

    :param content: `bytes` are used as is, `str` is encoded as UTF-8, and for a `Path` the \
    content of the file is read.
    :param mimetype: mimetype of the content. If not given, it is guessed from the file name \
    for `Path` content; the fallback is ``application/octet-stream``.
    :return: `str` object (consisting only of ASCII, though)

    .. seealso:: https://en.wikipedia.org/wiki/Data_URI_scheme
    """
    if isinstance(content, pathlib.Path):
        if not mimetype:
            # Guess from the file name before `content` is rebound to the file's bytes.
            mimetype = mimetypes.guess_type(content.name)[0]
        content = content.read_bytes()
    elif isinstance(content, str):
        content = content.encode('utf8')
    payload = base64.b64encode(content).decode('ascii')
    return "data:{0};base64,{1}".format(mimetype or 'application/octet-stream', payload)


def log_or_raise(msg: str, log=None, level='warning', exception_cls=ValueError):
    """
    Utility for check procedures. If `log` is `None`, this works like `pytest -x`, otherwise
    the issue is just logged with the appropriate level.

    :param msg: The message to log or to raise the exception with.
    :param log: Object providing a logging method named `level`; if falsy, an exception is \
    raised instead.
    :param level: Name of the method of `log` which is called with `msg`.
    :param exception_cls: Class of the exception to raise when no `log` is given.
    :raises exception_cls: if no `log` is given.

    .. code-block:: python

        >>> from clldutils.misc import log_or_raise
        >>> log_or_raise("there's a problem")
        Traceback (most recent call last):
        ...
        ValueError: there's a problem
        >>> import logging
        >>> log_or_raise("there's a problem", log=logging.getLogger(__name__))
        there's a problem
    """
    if not log:
        raise exception_cls(msg)
    getattr(log, level)(msg)


def nfilter(seq: typing.Iterable) -> list:
    """Replacement for python 2's filter(None, seq).

    :param seq: Any iterable.
    :return: a list filtered from seq containing only truthy items.
    """
    return list(filter(None, seq))


def to_binary(s: typing.Union[str, bytes], encoding='utf8') -> bytes:
    """Cast function.

    :param s: object to be converted to bytes; `bytes` objects are returned unchanged.
    :param encoding: encoding used to convert a `str` object.
    :return: `bytes` object.
    """
    if isinstance(s, bytes):
        return s
    return bytes(s, encoding=encoding)


def dict_merged(d, _filter=None, **kw):
    """
    Update dictionary d with the items passed as kw if the value passes _filter.

    A non-empty `d` is updated in place and returned; a falsy `d` (`None` or an empty `dict`)
    is replaced with a new `dict`.

    :param d: The `dict` to update or `None`.
    :param _filter: Callable accepting a value and returning whether the item should be \
    merged. By default, all values which are not `None` are merged.
    :param kw: The items to merge.
    :return: The updated `dict`.

    .. code-block:: python

        >>> from clldutils.misc import dict_merged
        >>> dict_merged({'a': 1}, b=2, c=3, _filter=lambda v: v > 2)
        {'a': 1, 'c': 3}
    """
    accept = _filter or (lambda value: value is not None)
    d = d or {}
    for key, value in kw.items():
        if accept(value):
            d[key] = value
    return d


class NoDefault:
    """Type of the :data:`NO_DEFAULT` marker object."""

    def __repr__(self):
        return '<NoDefault>'


#: Marker object for optional arguments of callables: it allows telling "no argument was
#: passed" apart from "None was passed explicitly".
NO_DEFAULT = NoDefault()


# Control characters U+0000-U+0008, U+000B, U+000C and U+000E-U+001F, i.e. all characters
# below U+0020 except tab, line feed and carriage return. Compiled once at import time.
_XML_INVALID_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')


def xmlchars(text: str) -> str:
    """Not all of UTF-8 is considered valid character data in XML ...

    Thus, this function can be used to remove illegal characters from ``text``.

    The characters removed are the control characters below U+0020, except for tab (U+0009),
    line feed (U+000A) and carriage return (U+000D).

    :param text: The text to clean.
    :return: ``text`` with the control characters removed.

    .. seealso:: `<https://en.wikipedia.org/wiki/Valid_characters_in_XML>`_
    """
    return _XML_INVALID_CHARS.sub('', text)


def format_size(num: int) -> str:
    """Format byte-sizes for human readability.

    Cf. the `-h` option of the `du` command:

        -h, --human-readable print sizes in human readable format (e.g., 1K 234M 2G)

    :param num: Size given as number of bytes.
    :return: The size with one decimal place, followed by the unit (``bytes``, ``KB``, \
    ``MB``, ``GB`` or ``TB``).

    .. seealso:: `<http://stackoverflow.com/a/1094933>`_
    """
    for unit in ('bytes', 'KB', 'MB', 'GB'):
        if -1024.0 < num < 1024.0:
            return "%3.1f%s" % (num, unit)
        num /= 1024.0
    # TB is the largest unit: anything bigger is reported as a (large) number of TB.
    return "%3.1f%s" % (num, 'TB')


class UnicodeMixin:
    """Deprecated mixin deriving ``__str__`` from a ``__unicode__`` method."""

    def __unicode__(self):
        """Return a human readable label for the object."""
        # Formatting with %s goes through `__str__`, which calls this method again.
        return '%s' % self  # pragma: no cover

    def __str__(self):
        """Emit a deprecation warning, then return the label computed by ``__unicode__``."""
        deprecated("Use of deprecated class UnicodeMixin! Use object instead.")
        label = self.__unicode__()
        return label


def slug(s: str, remove_whitespace: bool = True, lowercase: bool = True) -> str:
    """
    Condenses a string to contain only (lowercase) alphanumeric characters.

    :param s: The string to condense.
    :param remove_whitespace: If `True`, whitespace is removed, otherwise each run of \
    whitespace is replaced with a single space.
    :param lowercase: Whether to convert the string to lowercase.
    :return: `str` consisting only of ASCII letters, digits and - if whitespace is kept - \
    spaces.
    :raises AssertionError: if any other character is left after the clean-up \
    (e.g. an ASCII control character).

    .. code-block:: python

        >>> from clldutils.misc import slug
        >>> slug('Some words!')
        'somewords'
        >>> slug('Some words!', lowercase=False)
        'Somewords'
        >>> slug('Some words!', remove_whitespace=False)
        'some words'
    """
    # Decompose characters, then drop the combining marks to strip diacritics.
    res = ''.join(
        c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
    if lowercase:
        res = res.lower()
    # Delete all ASCII punctuation in one pass.
    res = res.translate(str.maketrans('', '', string.punctuation))
    res = re.sub(r'\s+', '' if remove_whitespace else ' ', res)
    # Whatever is still not ASCII at this point is dropped.
    res = res.encode('ascii', 'ignore').decode('ascii')
    assert re.match('[ A-Za-z0-9]*$', res)
    return res


def encoded(string: typing.Union[str, bytes], encoding='utf-8') -> bytes:
    """Cast string to bytes in a specific encoding - with some guessing about the encoding.

    :param string: `str` or `bytes` object to be cast.
    :param encoding: encoding which the object is forced to
    :return: `bytes` object which can be decoded using `encoding`.
    """
    assert isinstance(string, (str, bytes))
    if isinstance(string, str):
        return string.encode(encoding)
    try:
        # make sure the string can be decoded in the specified encoding ...
        string.decode(encoding)
    except UnicodeDecodeError:
        # ... if not use latin1 as best guess to decode the string before encoding as
        # specified.
        return string.decode('latin1').encode(encoding)
    return string


class lazyproperty(object):
    """Non-data descriptor caching the computed result as instance attribute.

    .. code-block:: python

        >>> class Spam(object):
        ...     @lazyproperty
        ...     def eggs(self):
        ...         return 'spamspamspam'
        >>> spam=Spam(); spam.eggs
        'spamspamspam'
        >>> spam.eggs='eggseggseggs'; spam.eggs
        'eggseggseggs'
        >>> Spam().eggs
        'spamspamspam'
        >>> Spam.eggs  # doctest: +ELLIPSIS
        <...lazyproperty object at 0x...>

    .. note::

        Since Python 3.8 added the `functools.cached_property` decorator
        (see `<https://docs.python.org/3/library/functools.html#functools.cached_property>`_),
        this class will be deprecated once Python 3.7 is no longer supported.
    """

    def __init__(self, fget):
        """
        :param fget: The method computing the value; called with the instance as sole argument.
        """
        self.fget = fget
        # Take over the metadata of the wrapped method; `__name__` is also the key under
        # which the computed value is cached.
        for attr in ('__module__', '__name__', '__doc__'):
            setattr(self, attr, getattr(fget, attr))

    def __get__(self, instance, owner):
        if instance is None:
            # Accessed on the class: return the descriptor itself.
            return self
        # Store the value in the instance `__dict__`. Since this class defines no `__set__`,
        # the instance attribute takes precedence on all subsequent lookups.
        result = instance.__dict__[self.__name__] = self.fget(instance)
        return result