Source code for clldutils.source

"""
This module provides functionality to handle bibliographic metadata, i.e. structured metadata
describing sources of data/research.
"""
import re
import typing
import itertools
import collections

from pylatexenc.latex2text import LatexNodes2Text
from bibtexparser.middlewares import names

# Names exported by `from clldutils.source import *`.
__all__ = ['Source']

# Syntax accepted for source ids (citekeys): one or more ASCII letters, digits, hyphens or
# underscores. `Source.__init__` raises `ValueError` for ids not matching this pattern unless
# the check is disabled.
ID_PATTERN = re.compile(r'^[a-zA-Z\-_0-9]+$')


class Source(collections.OrderedDict):
    """
    Bibliographic metadata about a source used for some analysis in a linguistic database.

    Following BibTeX-style, a `Source` is just an ordered list of key-value pairs, augmented
    with an id (a.k.a. BibTeX citekey) and a genre (a.k.a. Entry Types).

    :ivar str id: The citekey of a source.
    :ivar str genre: The entry type of a source.

    .. note::

        We restrict the allowed syntax for the id to make sure it can safely be used
        as path component in a URL. To skip this check, pass `_check_id=False` to the
        constructor.

    Usage:

    .. code-block:: python

        >>> from clldutils.source import Source
        >>> src = Source('article', 'Meier2000', author='Meier', year='2000', title='The Title')
        >>> print(src.bibtex())
        @article{Meier2000,
         author = {Meier},
         year   = {2000},
         title  = {The Title}
        }
        >>> print(src)
        Meier. 2000. The Title.
    """

    # Parenthesized note appended by `text()` for genres which warrant one.
    _genre_note = {
        'phdthesis': 'dissertation',
        'mastersthesis': 'MA thesis',
        'unpublished': 'unpublished',
    }

    def __init__(self,
                 genre: str,
                 id_: str,
                 *args,
                 _check_id: bool = True,
                 _lowercase: bool = False,
                 _strip_tex: typing.Optional[typing.Iterable[str]] = None,
                 **kw):
        """
        :param genre: The entry type of the source.
        :param id_: The citekey of the source.
        :param args: Positional arguments passed through to `OrderedDict.__init__`.
        :param kw: Fields of the bibliographical record as key-value pairs.
        :param _check_id: Flag signaling whether to check the id or not.
        :param _lowercase: Flag signaling whether genre and field names should be lowercased \
        or not.
        :param _strip_tex: `Iterable` of field names for which the value should be stripped \
        from any TeX formatting.
        :raises ValueError: If `_check_id` is set and `id_` does not match `ID_PATTERN`.
        """
        if _check_id and not ID_PATTERN.match(id_):
            raise ValueError(id_)
        self.genre = genre.lower() if _lowercase else genre
        if _strip_tex:
            # Field names are compared case-insensitively.
            _strip_tex = [k.lower() for k in _strip_tex]
            kw = {
                k: LatexNodes2Text().latex_to_text(v) if k.lower() in _strip_tex else v
                for k, v in kw.items()}
        self.id = id_
        super().__init__(
            *args, **{k.lower() if _lowercase else k: v for k, v in kw.items()})

    def __bool__(self):  # pragma: no cover
        # A source is truthy even when it has no fields (a plain empty dict would be falsy).
        return True

    __nonzero__ = __bool__

    def __str__(self):
        return self.text()

    def __repr__(self):
        return '<%s %s>' % (self.__class__.__name__, self.id)

    @classmethod
    def from_entry(cls, key: str, entry, **_kw) -> 'Source':
        """
        Factory method to initialize a `Source` instance from a `pybtex.database.Entry`.

        :param key: Citation key, e.g. a key in `pybtex.database.BibliographyData.entries`.
        :param entry: `pybtex.database.Entry`
        :param _kw: Keyword arguments passed through to `cls.__init__`
        :return: The new `Source` instance.
        """
        _kw.update(entry.fields.items())
        for role in (entry.persons or []):
            if entry.persons[role]:
                # Persons of one role are merged into a single BibTeX-style name list.
                _kw[role] = ' and '.join('%s' % p for p in entry.persons[role])
        return cls(entry.type, key, **_kw)

    @classmethod
    def from_bibtex(cls, bibtexString: str, lowercase: bool = False, _check_id: bool = True) \
            -> 'Source':
        """
        Initialize a `Source` object from the data in a BibTeX record.

        .. note::

            We support somewhat limited BibTeX syntax. Thus, it's best to feed preprocessed
            BibTeX (e.g. using a tool such as `bibtool`). In particular, we assume all
            key-value-pairs to be on single lines, i.e. we don't support multiline values.

            Alternatively, you can parse BibTeX data using `pybtex` and feed
            `pybtex.database.Entry` objects to :meth:`Source.from_entry`.

        :param bibtexString: The BibTeX record.
        :param lowercase: Flag signaling whether field names should be lowercased.
        :param _check_id: Flag passed through to `cls.__init__`.
        :return: The `Source`, or `None` if no line starting a BibTeX entry was found.
        """
        # genre and key are parsed from the @-line:
        atLine = re.compile(r"^@(?P<genre>[a-zA-Z_]+)\s*{\s*(?P<key>[^,]*)\s*,\s*")
        # since all key-value pairs fit on one line, it's easy to determine the
        # end of the value: right before the last closing brace!
        fieldLine = re.compile(r'\s*(?P<field>[a-zA-Z_]+)\s*=\s*({|")(?P<value>.+)')
        endLine = re.compile(r"}\s*")

        source = None
        for line in bibtexString.strip().split('\n'):
            if source is None:
                # Everything before the @-line is skipped.
                m = atLine.match(line)
                if m:
                    source = cls(
                        m.group('genre').strip().lower(),
                        m.group('key').strip(),
                        _check_id=_check_id)
                continue

            m = fieldLine.match(line)
            if m:
                value = m.group('value').strip()
                if value.endswith(','):
                    value = value[:-1].strip()
                # Only values terminated by a closing delimiter on the same line are accepted.
                if value.endswith(('}', '"')):
                    field = m.group('field')
                    if lowercase:
                        field = field.lower()
                    source[field] = value[:-1].strip()
            elif endLine.match(line):
                break
            # Note: fields with names not matching the expected pattern are simply
            # ignored.
        return source

    @staticmethod
    def split_names(s: str) -> 'typing.List[names.NameParts]':
        """
        Split a string of person names into the parsed parts of each individual name.

        :param s: Names separated by " and " or " & ".
        :return: `list` of parsed names, one per person.
        """
        return [
            names.parse_single_name_into_parts(name[:-1].strip() if name.endswith(',') else name)
            for name in names.split_multiple_persons_names(s.replace(' & ', ' and '))]

    @staticmethod
    def reformat_names(s: str) -> str:
        """
        Format a string of person names for use in a linearized reference.

        The first name is given as "last, first von, jr", all following names are given first
        name first; names are separated by ", ", with " & " before the last one.

        :param s: Names in a format suitable for :meth:`Source.split_names`.
        :return: The formatted names.
        """
        persons = Source.split_names(s)
        res = ''
        for i, nameparts in enumerate(persons):
            if i == 0:
                first = ''
                if nameparts.first:
                    first += ' '.join(nameparts.first)
                if nameparts.von:
                    first += ' {}'.format(' '.join(nameparts.von))
                if nameparts.jr:
                    first += ', {}'.format(' '.join(nameparts.jr))
                res += '{}{}'.format(' '.join(nameparts.last), ', ' + first if first else '')
            else:
                res += ' & ' if i + 1 == len(persons) else ', '
                res += nameparts.merge_first_name_first
        return res

    def bibtex(self) -> str:
        """
        Represent the source in BibTeX format.

        :return: string encoding the source in BibTeX syntax.
        """
        # Field names are padded to the longest one, to align the "=" signs.
        m = max(map(len, self), default=0)
        # NOTE(review): the width of the leading indent was taken from a whitespace-collapsed
        # rendering of this module - confirm against the upstream source.
        fields = (" %s = {%s}" % (k.ljust(m), self[k]) for k in self)
        return "@%s{%s,\n%s\n}" % (
            getattr(self.genre, 'value', self.genre), self.id, ",\n".join(fields))

    def get_with_translation(self, key):
        """
        Get the value for `key`, followed by the value of the field `<key>_english` in square
        brackets, if both are non-empty.

        :param key: Field name.
        :return: The (possibly augmented) field value, or `None` if the field is missing.
        """
        res = self.get(key)
        if res and self.get(key + '_english'):
            res = '{0} [{1}]'.format(res, self.get(key + '_english'))
        return res

    @property
    def norm_pages(self):
        """The `pages` field with "--" replaced by an en dash; "" if the field is missing."""
        return (self.get('pages') or '').replace('--', '–')

    def text(self, markdown=False) -> str:
        """
        Linearize the bib source according to the rules of the unified style.

        :param markdown: If True, italics are used to distinguish volume titles.
        :return: The formatted reference.

        - Book: author. year. booktitle. (series, volume.) address: publisher.
        - Article: author. year. title. journal volume(issue). pages.
        - Incollection: author. year. title. In editor (ed.), booktitle, pages. address: \
        publisher.

        .. seealso::

            `<https://www.linguisticsociety.org/sites/default/files/style-sheet_0.pdf>`_
        """
        def fmt_edition(e):
            # Turn a number into an ordinal ("1st", "2nd", "3rd", "4th", "11th", ...).
            try:
                e = int(e)
                return "%d%s" % (e, "tsnrhtdd"[(e // 10 % 10 != 1) * (e % 10 < 4) * e % 10::4])
            except ValueError:  # pragma: no cover
                return e

        def italicized(s):
            if not s:
                return s  # pragma: no cover
            return '_{}_'.format(s) if markdown else s

        def fmt_doi():
            return 'doi: {}'.format(
                '[{0}](https://doi.org/{0})'.format(self['doi']) if markdown else self['doi'])

        genre = getattr(self.genre, 'value', self.genre)
        pages_at_end = genre in (
            'book',
            'phdthesis',
            'mastersthesis',
            'misc',
            'techreport')
        thesis = genre in ('phdthesis', 'mastersthesis')

        if self.get('editor'):
            # Editor names are only reformatted when they take the place of the authors.
            editors = self['editor'] if self.get('author') else self.reformat_names(self['editor'])
            affix = 'eds' if ' and ' in editors or '&' in editors else 'ed'
            editors = " %s (%s.)" % (editors, affix)
        else:
            editors = None

        # Empty components are dropped when joining the result.
        res = [
            self.reformat_names(self['author']) if self.get('author') else editors,
            self.get('year', 'n.d')]

        if genre == 'book':
            # book title in italics.
            res.append(
                italicized(
                    self.get_with_translation('booktitle') or  # noqa: W504
                    self.get_with_translation('title')))
            series = ', '.join(filter(
                None, [self.get('series'), self.get('volume', self.get('number'))]))
            if series:
                res.append('(%s.)' % series)
        elif genre == 'misc':
            # in case of misc records, we use the note field in case a title is missing.
            res.append(self.get_with_translation('title') or self.get('note'))
        else:
            # Dissertation title in italics.
            res.append(
                italicized(self.get_with_translation('title')) if genre == 'phdthesis'
                else self.get_with_translation('title'))

        if genre == 'article':
            # journal in italics!
            atom = ' '.join(filter(None, [italicized(self.get('journal')), self.get('volume')]))
            if self.get('issue') or self.get('number'):
                atom += '(%s)' % (self.get('issue') or self.get('number'))
            res.append(atom)
            if self.get('pages'):
                res.append(self.norm_pages)
            if self.get('doi'):
                res.append(fmt_doi())
        elif genre == 'incollection' or genre == 'inproceedings':
            prefix = 'In'
            atom = ''
            if editors:
                atom += editors
            if self.get('booktitle'):
                if atom:
                    atom += ','
                atom += " %s" % italicized(self.get_with_translation('booktitle'))
            if self.get('pages'):
                atom += ", %s" % self.norm_pages
            res.append(prefix + atom)
        else:
            # check for author to make sure we haven't included the editors yet.
            if editors and self.get('author'):
                res.append("In %s" % editors)

            for attr in ('journal', 'volume' if genre != 'book' else None):
                if attr and self.get(attr):
                    # Italicize the journal *name*, i.e. the field value. (Italicizing the
                    # field name would look up a non-existing field in markdown mode and
                    # thereby drop the journal from the result.)
                    value = self[attr]
                    res.append(italicized(value) if attr == 'journal' else value)

            if self.get('issue'):
                res.append("(%s)" % self['issue'])

            if not pages_at_end and self.get('pages'):
                res.append(self.norm_pages)

        thesis_handled = False
        if thesis and self.get('school'):
            res.append('{}{} {}'.format(
                '{}: '.format(self['address']) if self.get('address') else '',
                self['school'],
                self._genre_note.get(genre)))
            if self.get('pages'):
                res.append('({}pp.)'.format(self.norm_pages))
            thesis_handled = True
        elif self.get('publisher'):
            if self.get('edition'):
                res.append('{} edn'.format(fmt_edition(self.get('edition'))))
            publisher = self.get('publisher')
            if self.get('address') and publisher.startswith('{}:'.format(self['address'])):
                # The publisher field already includes the address.
                res.append(self['publisher'])
            else:
                res.append(": ".join(filter(None, [self.get('address'), self['publisher']])))
        else:
            if genre == 'misc' and self.get('howpublished'):
                res.append(self.get('howpublished'))

        if not thesis and pages_at_end and self.get('pages'):
            res.append(self.norm_pages + 'pp')

        if genre != 'article':
            # For articles the DOI has already been added above.
            if self.get('doi'):
                res.append(fmt_doi())

        # The genre note is only a fallback, and not repeated if already given with the school.
        note = self.get('note') or (self._genre_note.get(genre) if not thesis_handled else '')
        if note and note not in res:
            if thesis:
                joiner = ','
                if self.get('pages'):
                    note += '{0} {1}pp.'.format(joiner, self.norm_pages)
            res.append('(%s)' % note)

        # Each component is terminated with a dot, unless it already ends in one.
        return ' '.join(
            x if x.endswith(('.', '.)')) else '%s.' % x for x in res if x).strip()