# Source code for clldutils.oaipmh

"""
A minimalistic implementation of an OAI-PMH harvester.
"""
import typing
import datetime
import collections
import urllib.parse
import urllib.request
from xml.etree import ElementTree

from dateutil.parser import isoparse
import attr

# Public API of this module.
__all__ = ['NAMESPACES', 'qname', 'Record', 'iter_records']

# Maps the namespace prefixes accepted by `qname` to the corresponding XML namespace URIs:
# the OAI-PMH protocol namespace, the `oai_dc` container namespace and the Dublin Core
# elements namespace.
NAMESPACES = {
    'oai': "http://www.openarchives.org/OAI/2.0/",
    'oai_dc': "http://www.openarchives.org/OAI/2.0/oai_dc/",
    'dc': "http://purl.org/dc/elements/1.1/",
}


def qname(lname: str, prefix: str = 'oai') -> str:
    """
    Builds a namespace-qualified tag name in the `{namespace-uri}local-name` notation understood
    by ElementTree, see
    https://docs.python.org/3/library/xml.etree.elementtree.html#parsing-xml-with-namespaces

    :param lname: the local name of the element.
    :param prefix: key in `NAMESPACES` selecting the namespace URI; an unknown prefix raises \
    `KeyError`.
    :return: the qualified name.
    """
    namespace_uri = NAMESPACES[prefix]
    return f'{{{namespace_uri}}}{lname}'


@attr.s
class Record:
    """
    A single record as found in an OAI-PMH response.

    :ivar identifier: the unique identifier of an item in a repository.
    :ivar datestamp: the datestamp of the record; the given value is converted with \
    `dateutil.parser.isoparse`.
    :ivar metadata: the `metadata` element of the record, or `None` if there is none.
    :ivar about: list of the `about` elements of the record.
    :ivar sets: list of the `setSpec` values given in the record header.
    :ivar status: value of the `status` attribute of the record header (e.g. `'deleted'`), or \
    `None` if the attribute is absent.
    :ivar oai_dc_metadata: `None` if no `oai_dc` metadata is available, otherwise a `dict` mapping \
    Dublin Core terms (specified as local names) to lists of values.
    """
    identifier = attr.ib()
    datestamp = attr.ib(converter=isoparse)
    metadata = attr.ib(
        default=None,
        validator=attr.validators.optional(attr.validators.instance_of(ElementTree.Element)))
    about = attr.ib(default=attr.Factory(list))
    sets = attr.ib(default=attr.Factory(list))
    status = attr.ib(
        default=None,
        validator=attr.validators.optional(attr.validators.instance_of(str)))
    oai_dc_metadata = attr.ib(
        default=None,
        validator=attr.validators.optional(attr.validators.instance_of(dict)))

    @classmethod
    def from_element(cls, e):
        """
        Creates a `Record` from an OAI-PMH `record` element.

        :param e: the `ElementTree.Element` of a `record`; it must contain a `header` element \
        with `identifier` and `datestamp` children.
        :return: the corresponding `Record` instance.
        """
        header = e.find(qname('header'))
        md = e.find(qname('metadata'))
        status = header.attrib.get('status')
        oai_dc_metadata = None
        # Note: Deleted items may not have metadata! A non-deleted record without a `metadata`
        # element is tolerated as well, instead of failing with an `AttributeError`.
        if status != 'deleted' and md is not None:
            dc_container = md.find(qname('dc', prefix='oai_dc'))
            if dc_container is not None:
                # '{<dc namespace URI>}': common prefix of all qualified Dublin Core tag names.
                dc_tag_prefix = qname('', prefix='dc')
                oai_dc_metadata = collections.defaultdict(list)
                for term in dc_container.iter():
                    if term.tag.startswith(dc_tag_prefix):
                        # Key by local name, i.e. the part of the tag after the namespace.
                        oai_dc_metadata[term.tag.partition('}')[2]].append(term.text)

        return cls(
            identifier=header.find(qname('identifier')).text,
            datestamp=header.find(qname('datestamp')).text,
            metadata=md,
            status=status,
            about=e.findall(qname('about')),
            sets=[set_spec.text for set_spec in header.findall(qname('setSpec'))],
            oai_dc_metadata=oai_dc_metadata
        )


class Response:
    """
    A parsed OAI-PMH response.

    :ivar xml: the root `ElementTree.Element` of the parsed response document.
    :ivar resumption_token: text of the first `resumptionToken` element found anywhere in the \
    document, or `None` if there is no such element.
    """
    def __init__(self, xml):
        self.xml = ElementTree.fromstring(xml)
        token_element = self.xml.find('.//' + qname('resumptionToken'))
        self.resumption_token = None if token_element is None else token_element.text


def request(url, params):
    """
    Fetches `url` with `params` as query string and wraps the result in a `Response`.

    Note: A query string already present in `url` is replaced by the encoded `params`.

    :param url: the URL to send the request to.
    :param params: the query parameters, in a form accepted by `urllib.parse.urlencode`.
    :return: `Response` created from the response body, decoded as UTF-8.
    """
    target = urllib.parse.urlparse(url)._replace(query=urllib.parse.urlencode(params))
    with urllib.request.urlopen(urllib.parse.urlunparse(target)) as http_response:
        body = http_response.read()
    return Response(body.decode('utf8'))


def iter_records(baseURL: str,
                 metadataPrefix: str = 'oai_dc',
                 from_: typing.Optional[typing.Union[str, datetime.date, datetime.datetime]] = None,
                 until: typing.Optional[typing.Union[str, datetime.date, datetime.datetime]] = None,
                 set_: typing.Optional[str] = None) -> typing.Generator[Record, None, None]:
    """
    Runs a `ListRecords` request on the specified OAI-PMH repository (using resumption tokens as
    necessary).

    .. seealso:: `<https://www.openarchives.org/OAI/openarchivesprotocol.html#ListRecords>`_

    .. code-block:: python

        >>> from clldutils.oaipmh import iter_records
        >>> recs = iter_records('https://account.lddjournal.org/index.php/uv1-j-ldd/oai')
        >>> next(recs).identifier
        'oai:ojs.pkp.sfu.ca:article/2'
        >>> next(recs).oai_dc_metadata['identifier']
        ['https://account.lddjournal.org/index.php/uv1-j-ldd/article/view/12', '10.25894/ldd12']

    :param baseURL: the base URL of the repository
    :param metadataPrefix: specifies the metadataPrefix of the format that should be included in \
    the metadata part of the returned records.
    :param from_: an optional argument with a UTCdatetime value, which specifies a lower bound \
    for datestamp-based selective harvesting. Strings are sent unchanged, `datetime.date` as \
    `YYYY-MM-DD` and `datetime.datetime` as `YYYY-MM-DDThh:mm:ssZ` (timezone-aware values are \
    converted to UTC, naive values are taken to be UTC already).
    :param until: an optional argument with a UTCdatetime value, which specifies an upper bound \
    for datestamp-based selective harvesting. Formatted in the same way as `from_`.
    :param set_: an optional argument with a setSpec value, which specifies set criteria for \
    selective harvesting.
    :return: a generator of `Record` instances, one for each `record` element found in the \
    responses. The responses are not inspected for OAI-PMH `error` elements.
    """
    def format_date(d):
        # Strings are passed through as is; the caller is responsible for the format.
        if isinstance(d, str):
            return d
        # `datetime.datetime` is a subclass of `datetime.date`, thus must be checked first.
        if isinstance(d, datetime.datetime):
            # OAI-PMH requires UTCdatetime values with seconds granularity to be given as
            # `YYYY-MM-DDThh:mm:ssZ`. Plain `isoformat()` lacks the trailing `Z` and may include
            # microseconds or a UTC offset.
            if d.utcoffset() is not None:
                d = d.astimezone(datetime.timezone.utc)
            return d.replace(microsecond=0, tzinfo=None).isoformat() + 'Z'
        return d.isoformat()

    params = dict(verb='ListRecords', metadataPrefix=metadataPrefix)
    if from_:
        params['from'] = format_date(from_)
    if until:
        params['until'] = format_date(until)
    if set_:
        params['set'] = set_

    record_path = './/{}'.format(qname('record'))
    res = request(baseURL, params)
    while res:
        for e in res.xml.findall(record_path):
            yield Record.from_element(e)
        # Follow-up requests carry only the verb and the resumption token; a missing or empty
        # token means the list is complete.
        res = request(baseURL, dict(verb='ListRecords', resumptionToken=res.resumption_token)) \
            if res.resumption_token else None