Source code for clldutils.oaipmh

"""
A minimalistic implementation of an OAI-PMH harvester.
"""
import typing
import datetime
import collections
import urllib.parse
import urllib.request
from xml.etree import ElementTree

from dateutil.parser import isoparse
import attr

# Names exported as the public API of this module.
__all__ = ['NAMESPACES', 'qname', 'Record', 'iter_records']

# Maps short namespace prefixes to XML namespace URIs. `qname` looks up the URI by prefix
# to build namespace-qualified tag names.
NAMESPACES = {
    'oai': "http://www.openarchives.org/OAI/2.0/",
    'oai_dc': "http://www.openarchives.org/OAI/2.0/oai_dc/",
    'dc': "http://purl.org/dc/elements/1.1/",
}


def qname(lname: str, prefix: str = 'oai') -> str:
    """
    Build the namespace-qualified form of a local XML name.

    The result uses the ``{namespace-uri}localname`` notation understood by ElementTree's
    namespace-aware functionality, see
    https://docs.python.org/3/library/xml.etree.elementtree.html#parsing-xml-with-namespaces

    :param lname: the local name to qualify.
    :param prefix: key into `NAMESPACES` selecting the namespace URI.
    :return: the qualified name.
    :raises KeyError: if `prefix` is not a key of `NAMESPACES`.
    """
    namespace_uri = NAMESPACES[prefix]
    return '{{{}}}{}'.format(namespace_uri, lname)
@attr.s
class Record:
    """
    A single record as found in an OAI-PMH response.

    :ivar identifier: the unique identifier of an item in a repository.
    :ivar datestamp: the datestamp from the record header, converted with \
    `dateutil.parser.isoparse`.
    :ivar metadata: the `metadata` element of the record, or `None` if there is none.
    :ivar about: list of the `about` elements of the record.
    :ivar sets: list of the `setSpec` values from the record header.
    :ivar status: value of the `status` attribute of the record header, or `None`.
    :ivar oai_dc_metadata: `None` if no `oai_dc` metadata is available, otherwise a `dict` mapping \
    Dublin Core terms (specified as local names) to lists of values.
    """
    identifier = attr.ib()
    datestamp = attr.ib(converter=isoparse)
    metadata = attr.ib(
        default=None,
        validator=attr.validators.optional(attr.validators.instance_of(ElementTree.Element)))
    about = attr.ib(default=attr.Factory(list))
    sets = attr.ib(default=attr.Factory(list))
    status = attr.ib(
        default=None,
        validator=attr.validators.optional(attr.validators.instance_of(str)))
    oai_dc_metadata = attr.ib(
        default=None,
        validator=attr.validators.optional(attr.validators.instance_of(dict)))

    @classmethod
    def from_element(cls, e):
        """
        Create a `Record` from a parsed OAI-PMH `record` element.

        :param e: the `ElementTree.Element` of a `record`; it must contain a `header` element \
        with `identifier` and `datestamp` children.
        :return: the corresponding `Record` instance.
        """
        header = e.find(qname('header'))
        md = e.find(qname('metadata'))
        status = header.attrib.get('status')
        oai_dc_metadata = None

        # Note: Deleted items may not have metadata! The explicit `None` check additionally
        # keeps a non-deleted record that lacks a `metadata` element from raising
        # AttributeError; such a record simply ends up without `oai_dc_metadata`.
        if status != 'deleted' and md is not None:
            ee = md.find(qname('dc', prefix='oai_dc'))
            if ee is not None:
                # '{<dc namespace uri>}' - the prefix shared by all Dublin Core element tags.
                dc_tag_prefix = qname('', prefix='dc')
                oai_dc_metadata = collections.defaultdict(list)
                for eee in ee.iter():
                    if eee.tag.startswith(dc_tag_prefix):
                        # Key by local name, i.e. the part of the tag after the namespace.
                        oai_dc_metadata[eee.tag.partition('}')[2]].append(eee.text)

        return cls(
            identifier=header.find(qname('identifier')).text,
            datestamp=header.find(qname('datestamp')).text,
            metadata=md,
            status=status,
            about=e.findall(qname('about')),
            sets=[ee.text for ee in header.findall(qname('setSpec'))],
            oai_dc_metadata=oai_dc_metadata
        )
class Response:
    """
    The parsed XML body of a response from an OAI-PMH repository.

    :ivar xml: root `ElementTree.Element` of the parsed response.
    :ivar resumption_token: text of the first `resumptionToken` element found below the root, \
    or `None` if there is no such element.
    """
    def __init__(self, xml):
        self.xml = ElementTree.fromstring(xml)
        token_element = self.xml.find('.//' + qname('resumptionToken'))
        self.resumption_token = (
            token_element.text if isinstance(token_element, ElementTree.Element) else None)


def request(url, params):
    """
    Fetch `url` with `params` as its query string and wrap the result in a `Response`.

    Any query component already present in `url` is replaced by the URL-encoded `params`.
    The response body is decoded as UTF-8 before parsing.

    :param url: the URL to request.
    :param params: the query parameters, in a form accepted by `urllib.parse.urlencode`.
    :return: a `Response` instance.
    """
    scheme, netloc, path, path_params, _, fragment = urllib.parse.urlparse(url)
    target = urllib.parse.urlunparse(
        (scheme, netloc, path, path_params, urllib.parse.urlencode(params), fragment))
    with urllib.request.urlopen(target) as req:
        payload = req.read()
    return Response(payload.decode('utf8'))
def iter_records(baseURL: str,
                 metadataPrefix: str = 'oai_dc',
                 from_: typing.Optional[typing.Union[str, datetime.date, datetime.datetime]] = None,
                 until: typing.Optional[typing.Union[str, datetime.date, datetime.datetime]] = None,
                 set_: typing.Optional[str] = None) -> typing.Generator[Record, None, None]:
    """
    Runs a `ListRecords` request on the specified OAI-PMH repository (using resumption tokens as
    necessary).

    .. seealso:: `<https://www.openarchives.org/OAI/openarchivesprotocol.html#ListRecords>`_

    .. code-block:: python

        >>> from clldutils.oaipmh import iter_records
        >>> recs = iter_records('https://account.lddjournal.org/index.php/uv1-j-ldd/oai')
        >>> next(recs).identifier
        'oai:ojs.pkp.sfu.ca:article/2'
        >>> next(recs).oai_dc_metadata['identifier']
        ['https://account.lddjournal.org/index.php/uv1-j-ldd/article/view/12', '10.25894/ldd12']

    :param baseURL: the base URL of the repository
    :param metadataPrefix: specifies the metadataPrefix of the format that should be included in \
    the metadata part of the returned records.
    :param from_: an optional argument with a UTCdatetime value, which specifies a lower bound for \
    datestamp-based selective harvesting. Strings are passed on unchanged, `datetime.date` \
    objects are sent as `YYYY-MM-DD` and `datetime.datetime` objects as `YYYY-MM-DDThh:mm:ssZ` \
    (timezone-aware values are converted to UTC, naive values are taken to be UTC already).
    :param until: an optional argument with a UTCdatetime value, which specifies an upper bound \
    for datestamp-based selective harvesting. Formatted in the same way as `from_`.
    :param set_: an optional argument with a setSpec value, which specifies set criteria for \
    selective harvesting.
    :return: a generator of `Record` instances.
    """
    def format_date(d):
        # Render a date bound in the UTCdatetime syntax required by OAI-PMH.
        if isinstance(d, str):
            return d
        # `datetime.datetime` is a subclass of `datetime.date`, so it must be checked first.
        if isinstance(d, datetime.datetime):
            if d.utcoffset() is not None:
                d = d.astimezone(datetime.timezone.utc)
            # The protocol allows neither fractional seconds nor numeric UTC offsets; the
            # value must end with the "Z" designator.
            return d.replace(microsecond=0, tzinfo=None).isoformat() + 'Z'
        return d.isoformat()

    params = dict(verb='ListRecords', metadataPrefix=metadataPrefix)
    if from_:
        params['from'] = format_date(from_)
    if until:
        params['until'] = format_date(until)
    if set_:
        params['set'] = set_

    record_path = './/{}'.format(qname('record'))
    while True:
        res = request(baseURL, params)
        for e in res.xml.findall(record_path):
            yield Record.from_element(e)
        if not res.resumption_token:
            break
        # Follow-up requests carry only the verb and the resumption token.
        params = dict(verb='ListRecords', resumptionToken=res.resumption_token)