Source code for clldutils.metadata

"""
JSON-LD - the serialization format used for metadata in CLDF datasets - supports nested data.
To make creating (and reading) this data simpler, this module provides a Python API to build data
structures which "know" how to read from/serialize to JSON-LD.

Usage:

.. code-block:: python

    >>> from clldutils.metadata import *
    >>> md = Metadata(
    ...     title='The Data',
    ...     publisher=Publisher(name='Data Press', place='anywhere'),
    ...     license=License(name='CC-BY-4.0'))
    >>> md.to_jsonld()['dc:license']
    OrderedDict([('name', 'Creative Commons Attribution 4.0'),
                 ('url', 'https://creativecommons.org/licenses/by/4.0/'),
                 ('icon', 'cc-by.png')])
    >>> Metadata.from_jsonld(md.to_jsonld()).publisher.place
    'anywhere'
"""
import collections
import urllib.parse

import attr

from clldutils import licenses

__all__ = ['Publisher', 'License', 'Metadata']


[docs]@attr.s
class Publisher:
    """
    The entity publishing a dataset.

    :ivar name: Name of the publisher.
    :ivar place: Place or address of the publisher, used in "traditional" publisher formats.
    :ivar url: URL linking to the "homepage" of the publisher.
    :ivar contact: An email address under which to contact the publisher of a dataset.
    """
    name = attr.ib(
        metadata=dict(ldkey="http://xmlns.com/foaf/0.1/name"),
        default=None)
    place = attr.ib(
        metadata=dict(ldkey="dc:Location"),
        default=None)
    url = attr.ib(
        metadata=dict(ldkey="http://xmlns.com/foaf/0.1/homepage"),
        default=None)
    contact = attr.ib(
        metadata=dict(ldkey="http://xmlns.com/foaf/0.1/mbox"),
        default=None)


[docs]@attr.s
class License:
    """
    The license under which a dataset is published, characterized with name, URL and an icon.
    """
    name = attr.ib(
        default="Creative Commons Attribution 4.0 International License")
    url = attr.ib(
        default="https://creativecommons.org/licenses/by/4.0/")
    icon = attr.ib(
        default="cc-by.png")

    def __attrs_post_init__(self):
        lic = licenses.find(self.name)
        if lic:
            self.name = lic.name
            self.url = lic.url


[docs]@attr.s
class Metadata:
    """
    Metadata about the published version(s) of a dataset.

    :ivar Publisher publisher: The organisation or institution publishing the dataset.
    :ivar License license: The license under which the dataset can be used.
    :ivar str url: A URL under which the dataset can be browsed.
    :ivar str title: The title of the dataset.
    :ivar str description:
    """
    publisher = attr.ib(default=Publisher(), validator=attr.validators.instance_of(Publisher))
    license = attr.ib(default=License(), validator=attr.validators.instance_of(License))
    url = attr.ib(default=None)
    title = attr.ib(default=None)
    description = attr.ib(default=None)

    @classmethod
    def from_jsonld(cls, d, defaults=None):
        defaults = defaults or {}
        kw = {}
        for k, v in [
            ('dcat:accessURL', 'url'),
            ('dc:title', 'title'),
            ('dc:description', 'description'),
        ]:
            val = d.get(k) or defaults.get(v)
            if val:
                kw[v] = val
        for ldkey, cls_ in [('dc:publisher', Publisher), ('dc:license', License)]:
            ckw = {}
            dd = d.get(ldkey, {})
            for f in attr.fields(cls_):
                ckw[f.name] = dd.get(f.metadata.get('ldkey', f.name)) \
                    or defaults.get('{0}.{1}'.format(ldkey.split(':')[1], f.name))
            kw[cls_.__name__.lower()] = cls_(**{k: v for k, v in ckw.items() if v})
        return cls(**kw)

[docs]    def to_jsonld(self) -> collections.OrderedDict:
        """
        Returns a `dict` suitable for serialization as JSON-LD object, with the metadata tagged
        with suitable common properties.
        """
        items = [("@context", ["http://www.w3.org/ns/csvw", {"@language": "en"}])]
        for k, v in [
            ('dcat:accessURL', 'url'),
            ('dc:title', 'title'),
            ('dc:description', 'description'),
        ]:
            if getattr(self, v):
                items.append((k, getattr(self, v)))
        for ldkey, cls_ in [('dc:publisher', Publisher), ('dc:license', License)]:
            obj = getattr(self, ldkey.split(':')[1])
            json = collections.OrderedDict()
            for f in attr.fields(cls_):
                if getattr(obj, f.name):
                    json[f.metadata.get('ldkey', f.name)] = getattr(obj, f.name)
            items.append((ldkey, json))
        return collections.OrderedDict(items)

    @property
    def domain(self):
        return urllib.parse.urlparse(self.url).netloc