"""
This module provides miscellaneous utility functions, including backports of newer Python
functionality (:func:`lazyproperty`), formatting functions, etc.
"""
import re
import base64
import string
from typing import Union, Any
import pathlib
import warnings
import mimetypes
import unicodedata
from collections.abc import Iterable
# Names exported by a star-import of this module.
# NOTE(review): 'format_size' is listed here, but no definition of it is visible in this part
# of the file - confirm it is defined elsewhere in the module, since a star-import raises
# AttributeError for names in ``__all__`` that do not exist.
__all__ = [
'data_url', 'log_or_raise', 'nfilter', 'to_binary', 'dict_merged', 'NoDefault', 'NO_DEFAULT',
'xmlchars', 'format_size', 'slug', 'encoded',
]
def deprecated(msg):  # pragma: no cover
    """
    Mark deprecated functionality by emitting a `DeprecationWarning`.

    The warning is always emitted, regardless of the warning filters currently in effect, and
    is attributed to the code calling `deprecated` (``stacklevel=2``).

    :param msg: The message of the warning.
    """
    # Scope the "always" filter to this one call. Manipulating the filters without
    # `catch_warnings` would permanently alter the process-wide warning configuration.
    with warnings.catch_warnings():
        warnings.simplefilter('always', DeprecationWarning)
        warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
def data_url(content: Union[bytes, str, pathlib.Path], mimetype: str = None) -> str:
    """
    Encode content as a base64 Data URI, e.g. to embed (smallish) media resources in HTML pages.

    :param content: The payload. A `pathlib.Path` is read in binary mode, a `str` is encoded \
    as UTF-8, anything else is passed to the base64 encoder as is.
    :param mimetype: Mimetype of the content. If missing, it is guessed from the file name for \
    `pathlib.Path` content; the final fallback is ``application/octet-stream``.
    :return: `str` object (consisting only of ASCII, though)

    .. seealso:: https://en.wikipedia.org/wiki/Data_URI_scheme
    """
    if isinstance(content, pathlib.Path):
        path = content
        if not mimetype:
            # May still be None for unknown file extensions; handled by the fallback below.
            mimetype = mimetypes.guess_type(path.name)[0]
        payload = path.read_bytes()
    elif isinstance(content, str):
        payload = content.encode('utf8')
    else:
        payload = content

    if not mimetype:
        mimetype = 'application/octet-stream'
    b64 = base64.b64encode(payload).decode()
    return f"data:{mimetype};base64,{b64}"
def log_or_raise(msg: str, log=None, level='warning', exception_cls=ValueError):
    """
    Report a problem found by a check procedure - either by raising or by logging.

    Without a (truthy) `log` object the problem is fatal, i.e. `exception_cls` is raised.
    Otherwise the message is passed to the method of `log` named by `level`.

    :param msg: The message describing the problem.
    :param log: Logger-like object or `None`.
    :param level: Name of the method of `log` to call with `msg`.
    :param exception_cls: Exception class instantiated with `msg` and raised if there is no `log`.

    .. code-block:: python

        >>> from clldutils.misc import log_or_raise
        >>> log_or_raise("there's a problem")
        Traceback (most recent call last):
        ...
        ValueError: there's a problem
        >>> import logging
        >>> log_or_raise("there's a problem", log=logging.getLogger(__name__))
        there's a problem
    """
    if not log:
        raise exception_cls(msg)
    log_method = getattr(log, level)
    log_method(msg)
def nfilter(seq: Iterable[Any]) -> list[Any]:
    """
    Drop all falsy items from an iterable (like python 2's ``filter(None, seq)``).

    :param seq: The items to be filtered.
    :return: a new `list` with the truthy items of `seq`, in their original order.
    """
    return list(filter(None, seq))
def to_binary(s: Union[str, bytes], encoding='utf8') -> bytes:
    """
    Cast function, making sure we get `bytes`.

    :param s: object to be converted to bytes; `bytes` objects are returned unchanged.
    :param encoding: encoding used to convert `str` input.
    """
    if isinstance(s, bytes):
        return s
    return bytes(s, encoding=encoding)
def dict_merged(d, _filter=None, **kw):
    """
    Update dictionary d with the items passed as kw if the value passes _filter.

    `d` is updated in place and returned; only if `d` is `None` a new `dict` is created.

    :param d: The dictionary to be updated or `None`.
    :param _filter: Callable accepting a value and returning whether the item should be \
    merged. By default, items are merged if the value is not `None`.
    :param kw: The items to be merged into `d`.
    :return: The updated dictionary.

    .. code-block:: python

        >>> from clldutils.misc import dict_merged
        >>> dict_merged({'a': 1}, b=2, c=3, _filter=lambda v: v > 2)
        {'a': 1, 'c': 3}
    """
    # Explicitly check for None: a truthiness test would silently swap a passed-in *empty*
    # dict for a new one, so the caller's dict would not be updated - unlike a non-empty one.
    if d is None:
        d = {}
    keep = _filter if _filter else (lambda value: value is not None)
    for key, value in kw.items():
        if keep(value):
            d[key] = value
    return d
class NoDefault:  # pylint: disable=too-few-public-methods
    """
    Type of a default object for cases where `None` is a regular value and thus cannot be
    used to signal "no value given".
    """

    def __repr__(self):
        # A fixed, recognizable representation, e.g. for signatures in rendered documentation.
        return '<NoDefault>'
#: A singleton which callables with optional arguments can use as default value, in order to
#: distinguish "no argument passed" from ``None`` passed explicitly as argument.
NO_DEFAULT = NoDefault()
# Everything outside of the ``Char`` production of XML 1.0: the C0 control characters except
# tab, line feed and carriage return, the surrogate code points, and U+FFFE and U+FFFF.
# Compiled once, rather than on each call.
_XML_ILLEGAL_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\ud800-\udfff\ufffe\uffff]')


def xmlchars(text: str) -> str:
    """Not all of Unicode is considered valid character data in XML ...

    Thus, this function can be used to remove illegal characters from ``text``.

    Tab, line feed and carriage return are legal in XML and are kept; all other control
    characters below U+0020, lone surrogates and the non-characters U+FFFE and U+FFFF are
    removed.

    :param text: The text to be cleaned.
    :return: ``text`` without the characters that are illegal in XML 1.0.

    .. seealso:: `<https://en.wikipedia.org/wiki/Valid_characters_in_XML>`_
    """
    return _XML_ILLEGAL_CHARS.sub('', text)
# Translation table deleting all ASCII punctuation in one pass.
_SLUG_PUNCTUATION = str.maketrans('', '', string.punctuation)
# Whatever is neither an ASCII letter or digit nor a space must not end up in a slug.
_SLUG_DISALLOWED = re.compile('[^ A-Za-z0-9]')


def slug(s: str, remove_whitespace: bool = True, lowercase: bool = True) -> str:
    """
    Condenses a string to contain only (lowercase) alphanumeric characters.

    :param s: The string to be condensed.
    :param remove_whitespace: Whether to remove whitespace altogether; otherwise each run of \
    whitespace is replaced with a single space.
    :param lowercase: Whether to lowercase the string.
    :return: A `str` consisting only of ASCII letters and digits (and spaces, if whitespace \
    is not removed).

    .. code-block:: python

        >>> from clldutils.misc import slug
        >>> slug('Some words!')
        'somewords'
        >>> slug('Some words!', lowercase=False)
        'Somewords'
        >>> slug('Some words!', remove_whitespace=False)
        'some words'
    """
    # Strip diacritics: decompose and drop the combining marks (category "Mn").
    res = ''.join(
        c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
    if lowercase:
        res = res.lower()
    res = res.translate(_SLUG_PUNCTUATION)
    # Whitespace must be handled before dropping non-ASCII characters, because `\s` also
    # matches non-ASCII whitespace.
    res = re.sub(r'\s+', '' if remove_whitespace else ' ', res)
    res = res.encode('ascii', 'ignore').decode('ascii')
    # ASCII control characters survive all steps above. Remove them, rather than checking the
    # result with an `assert`, which would crash on such input - or be skipped under `-O`.
    return _SLUG_DISALLOWED.sub('', res)
def encoded(string_: Union[str, bytes], encoding='utf-8') -> bytes:
    """
    Cast string to bytes in a specific encoding - with some guessing about the encoding.

    `str` input is simply encoded. `bytes` input is returned unchanged if it can be decoded
    using `encoding`; otherwise it is assumed to be latin1 and re-encoded.

    :param string_: The `str` or `bytes` object to be cast.
    :param encoding: encoding which the object is forced to
    :return: `bytes` in the specified encoding.
    """
    assert isinstance(string_, (str, bytes))
    if isinstance(string_, str):
        return string_.encode(encoding)

    try:
        # Decoding serves as validity check only, the result is discarded.
        string_.decode(encoding)
    except UnicodeDecodeError:
        # Best guess: any byte sequence can be decoded as latin1.
        return string_.decode('latin1').encode(encoding)
    return string_