Source code for clldutils.iso_639_3

"""
This module provides an API to the information of the ISO-639-3 standard.
ISO-639-3 data is not distributed with this package. Instead, an :class:`ISO` instance can either
be passed the path to a local copy of the zipped ISO tables or it will download them from
`<https://iso639-3.sil.org/code_tables/download_tables>`_
"""

import io
import re
import csv
import string
from typing import Union, Protocol, Optional
import pathlib
import datetime
import functools
import collections
import urllib.request
from collections.abc import Generator, Iterable

from clldutils.path import TemporaryDirectory
from clldutils.ziparchive import ZipArchive

__all__ = ['ISO', 'Code', 'download_tables']

# Base URL of the ISO 639-3 website; request paths are appended to it.
BASE_URL = "https://iso639-3.sil.org/"
# Matches the site-relative path of the zipped code tables - followed by a closing double quote -
# in the content of the download page. The dot before the "zip" suffix is escaped, so that only
# a literal "." is accepted.
ZIP_NAME_PATTERN = re.compile(
    r'(?P<name>sites/iso639-3/files/downloads/iso-639-3_Code_Tables_[0-9]{8}\.zip)"')
# Matches the path of a table file within the zip archive; the group captures everything between
# the "iso-639-3" prefix and the ".tab" suffix, i.e. table name and datestamp.
TABLE_NAME_PATTERN = re.compile(r'/iso-639-3(?P<name_and_date>[^.]*)\.tab')
# Matches a datestamp of the form YYYYMMDD (with a year starting with "2"), capturing year,
# month and day as separate groups.
DATESTAMP_PATTERN = re.compile(r'(2[0-9]{3})([0-1][0-9])([0-3][0-9])')
USER_AGENT = 'Mozilla'  # It seems a python user-agent doesn't cut it anymore.

# For some reason, the retirements code table gives the wrong replacement codes in two
# cases (although they are described correctly on the website):
CHANGE_TO_ERRATA = {
    'guv': ['duz'],
    'ymt': ['mtm'],
}


class HasSplitlines(Protocol):  # pylint: disable=too-few-public-methods,missing-class-docstring
    """Structural type for any object providing a `splitlines` method, such as `str`."""
    def splitlines(self) -> Iterable[str]:  # pylint: disable=C0116
        """Return the lines of the object's text content."""
        ...  # pragma: no cover


def _open(path):
    """
    Open a resource of the ISO 639-3 website.

    :param path: Path of the resource, relative to `BASE_URL`.
    :return: The response object as returned by `urllib.request.urlopen`.
    """
    # The request carries a custom user agent header, see the comment on USER_AGENT.
    request_headers = {'User-Agent': USER_AGENT}
    request = urllib.request.Request(BASE_URL + path, headers=request_headers)
    return urllib.request.urlopen(request)


def iterrows(lines: Iterable[str]) -> Generator[collections.OrderedDict[str, str], None, None]:
    """
    Parse the lines of a tab-separated table into row dicts.

    The first row is used as header, supplying the keys for all following rows.

    :param lines: The lines of the table, without line endings.
    :return: Generator of one ordered dict per data row; nothing is yielded if there is no \
    header row.
    """
    reader = csv.reader(io.StringIO('\n'.join(lines)), delimiter='\t')
    column_names = next(reader, None)
    if column_names is None:
        # No rows at all - not even a header.
        return
    for record in reader:
        yield collections.OrderedDict(zip(column_names, record))


class Table(list):
    """
    A code table: the list of its rows as dicts, with table name and date attached.

    :ivar datetime.date date: The date parsed from the datestamp of the table.
    :ivar str name: The name of the table with datestamps removed, or `'Codes'` if no name \
    part remains.
    """
    def __init__(self, name_and_date: str, date: str, fp: HasSplitlines):
        """
        :param name_and_date: The `_`-separated name parts and datestamp of the table, as \
        captured by `TABLE_NAME_PATTERN`.
        :param date: Datestamp of the form YYYYMMDD.
        :param fp: Tab-separated content of the table, with the first non-blank line being \
        the header.
        """
        parts = name_and_date.split('_')
        # The ISO 639-3 code tables from 2020-05-15 contain a table with a
        # malformed name - having an excess "0" in the date stamp.
        # DATESTAMP_PATTERN does not match this malformed stamp, thus it must be removed
        # explicitly to keep it from ending up in the table name.
        if parts[-1] == '202000515':  # pragma: no cover
            date = '20200515'
            parts.pop()
        year, month, day = map(int, DATESTAMP_PATTERN.match(date).groups())
        self.date = datetime.date(year, month, day)
        name = '_'.join(p for p in parts if not DATESTAMP_PATTERN.match(p))
        if name.startswith(('_', '-')):
            name = name[1:]
        self.name = name or 'Codes'
        # Blank lines are dropped before parsing.
        super().__init__(iterrows(line for line in fp.splitlines() if line.strip()))


def download_tables(outdir: Optional[Union[str, pathlib.Path]] = None) -> pathlib.Path:
    """
    Download the zipped ISO tables to `outdir` or cwd.

    :param outdir: Directory to which to write the zip file; defaults to the current directory.
    :return: Path of the downloaded zip file.
    :raises ValueError: if no link to a zip file is found on the download page.
    """
    # Responses are used as context managers to make sure connections are closed.
    with _open('code_tables/download_tables') as response:
        page = response.read().decode('utf-8-sig')
    match = ZIP_NAME_PATTERN.search(page)
    if not match:
        raise ValueError('no matching zip file name found')  # pragma: no cover
    zip_path = match.group('name')
    target = pathlib.Path(outdir or '.').joinpath(zip_path.split('/')[-1])
    # The request is opened first, so no empty target file is created if it fails.
    with _open(zip_path) as response, target.open('wb') as fp:
        fp.write(response.read())
    return target
def iter_tables(
        zippath: Optional[Union[str, pathlib.Path]] = None) -> Generator[Table, None, None]:
    """
    Yield tables from a code tables zip archive.

    :param zippath: Path of a local copy of the zipped code tables. If not specified, the \
    archive is downloaded to a temporary directory first.
    :raises ValueError: if the path of a table in the archive does not contain a datestamp.
    """
    with TemporaryDirectory() as tmp:
        if not zippath:
            zippath = download_tables(tmp)
        with ZipArchive(zippath) as archive:
            for name in archive.namelist():
                match = TABLE_NAME_PATTERN.search(name)
                if not match:
                    # Only tables need a datestamp, thus other archive members are skipped
                    # before looking for one.
                    continue
                date = DATESTAMP_PATTERN.search(name)
                if not date:
                    raise ValueError(f'no datestamp in table name: {name}')
                yield Table(match.group('name_and_date'), date.group(0), archive.read_text(name))
@functools.total_ordering
class Code:
    """
    Represents one ISO 639-3 code and its associated metadata.

    Codes are hashed, compared and ordered by their three-letter code.

    :ivar str code: The three-letter code
    :ivar str name: The language name
    :ivar retired: `False` for codes which are not retired, the effective date of the \
    retirement otherwise.
    """
    # Matches three-letter codes in square brackets, as referenced in retirement remedies.
    _code_pattern = re.compile(r'\[([a-z]{3})]')
    _scope_map = {  # Scopes for items from table Codes.
        'I': 'Individual',
        'M': 'Macrolanguage',
        'S': 'Special',
    }
    _type_map = {  # Language types for items from table Codes.
        'L': 'Living',
        'E': 'Extinct',
        'A': 'Ancient',
        'H': 'Historical',
        'C': 'Constructed',
        'S': 'Special',
    }
    _rtype_map = {  # Retirement reasons for items from table Retirements.
        'C': 'change',
        'D': 'duplicate',
        'N': 'non-existent',
        'S': 'split',
        'M': 'merge',
    }

    def __init__(self, item: dict[str, str], tablename: str, registry: 'ISO'):
        """
        :param item: Row of a code table.
        :param tablename: Name of the table the row is taken from; one of `'Codes'`, \
        `'Retirements'` or `'Local'`.
        :param registry: The mapping of codes to `Code` instances used to look up related codes.
        :raises ValueError: if `tablename` is not one of the supported names.
        """
        code = item['Id']
        self._change_to = []
        self.retired: Union[bool, datetime.date] = False
        if tablename == 'Codes':
            self._scope = self._scope_map[item['Scope']]
            self._type = self._type_map[item['Language_Type']]
        elif tablename == 'Retirements':
            self._scope = 'Retirement'
            self._type = self._rtype_map[item['Ret_Reason']] if item['Ret_Reason'] else None
            self.retired = datetime.date(*map(int, item['Effective'].split('-')))
            if code in CHANGE_TO_ERRATA:
                self._change_to = CHANGE_TO_ERRATA[code]  # pragma: no cover
            else:
                if item['Change_To']:
                    assert item['Change_To'] != code
                    self._change_to = [item['Change_To']]
                elif item['Ret_Remedy']:
                    # Without explicit replacement code, the codes mentioned in the remedy
                    # are used - except for the retired code itself.
                    self._change_to = [
                        c for c in self._code_pattern.findall(item['Ret_Remedy'])
                        if c != code]
        elif tablename == 'Local':
            self._scope = 'Local'
            self._type = 'Special'
        else:
            raise ValueError(tablename)  # pragma: no cover

        self.code: str = code
        self.name: str = item['Ref_Name']
        self._registry = registry

    @property
    def type(self) -> str:
        """
        The type of the code formatted as pair "scope/type"
        """
        return f'{self._scope}/{self._type}'

    @property
    def is_retired(self) -> bool:
        """
        Flag signaling whether the code is retired.
        """
        return bool(self.retired)

    @property
    def change_to(self) -> list['Code']:
        """
        List of codes that supersede a retired code.

        Superseding codes which are retired themselves are replaced with the codes
        superseding them.
        """
        res = []
        for key in self._change_to:
            successor = self._registry[key]
            if not successor.is_retired:
                res.append(successor)
            else:
                res.extend(successor.change_to)
        return res

    @property
    def is_local(self) -> bool:
        """
        Flag signaling whether the code is in the private use area.
        """
        return self._scope == 'Local'

    @property
    def is_macrolanguage(self) -> bool:  # pylint: disable=C0116
        """
        Flag signaling whether the code is a macrolanguage code.
        """
        return self._scope == 'Macrolanguage'

    @property
    def extension(self) -> list['Code']:
        """
        The codes subsumed by a macrolanguage code.
        """
        if not self.is_macrolanguage:
            return []
        return [
            self._registry[c]  # pylint: disable=W0212
            for c in self._registry._macrolanguage[self.code]]  # pylint: disable=W0212

    def __hash__(self):
        return hash(self.code)

    def __eq__(self, other):
        # Objects without a code cannot be compared; signal this to Python rather than
        # failing with an AttributeError, e.g. in membership tests on mixed lists.
        if not hasattr(other, 'code'):
            return NotImplemented
        return self.code == other.code

    def __lt__(self, other):
        if not hasattr(other, 'code'):
            return NotImplemented
        return self.code < other.code

    def __repr__(self):
        return f'<ISO-639-3 [{self.code}] {self.type}>'

    def __str__(self):
        return f'{self.name} [{self.code}]'
class ISO(collections.OrderedDict):
    """
    Provides access to the content of ISO 639-3's downloadable code table.

    An `ISO` instance maps three-letter codes to :class:`Code` instances, and provides a couple
    of convenience methods.

    Usage:

    .. code-block:: python

        >>> from clldutils.iso_639_3 import ISO
        >>> iso = ISO('iso-639-3_Code_Tables_20220311.zip')
        >>> iso.retirements[0]
        <ISO-639-3 [fri] Retirement/change>
        >>> iso.retirements[0].change_to
        [<ISO-639-3 [fry] Individual/Living>]
    """
    def __init__(self, zippath: Optional[Union[str, pathlib.Path]] = None):
        """
        :param zippath: Path to a local copy of the "Complete Set of Tables" (UTF-8). If `None`, \
        the tables will be retrieved from the web.
        """
        zippath = pathlib.Path(zippath) if zippath else None
        self._tables = {}
        for table in iter_tables(zippath=zippath):
            self._tables[table.name] = table

        # A datestamp in the name of the zip file takes precedence over the dates of the tables.
        stamp = DATESTAMP_PATTERN.search(zippath.name) if zippath else None
        if stamp:
            year, month, day = (int(group) for group in stamp.groups())
            self.date: datetime.date = datetime.date(year, month, day)
        else:
            self.date: datetime.date = max(t.date for t in self._tables.values())

        # Maps macrolanguage codes to the list of codes of their member languages.
        self._macrolanguage = collections.defaultdict(list)
        for row in self._tables['macrolanguages']:
            self._macrolanguage[row['M_Id']].append(row['I_Id'])

        super().__init__()
        for tablename in ('Codes', 'Retirements'):
            for row in self._tables[tablename]:
                if row['Id'] in self:
                    # Note: we don't keep historical retirements, i.e. ones that have only
                    # been in effect for some time. E.g. lcq has been changed to ppr
                    # from 2012-02-03 until 2013-01-23 when it was changed back to lcq
                    continue
                self[row['Id']] = Code(row, tablename, self)

        # Codes in the local use area, i.e. qaa to qtz.
        letters = string.ascii_lowercase
        for second in letters[:letters.index('t') + 1]:
            for third in letters:
                local = 'q' + second + third
                self[local] = Code({'Id': local, 'Ref_Name': None}, 'Local', self)

    def __str__(self):
        return f'ISO 639-3 code tables from {self.date}'

    def by_type(self, type_) -> list[Code]:
        """Return all codes having the type `type_`, e.g. `'Living'`."""
        matching = []
        for code in self.values():
            if code._type == type_:  # pylint: disable=protected-access
                matching.append(code)
        return matching

    @property
    def living(self) -> list[Code]:
        """
        The codes of type "Living"
        """
        return self.by_type('Living')

    @property
    def extinct(self) -> list[Code]:
        """
        The codes of type "Extinct"
        """
        return self.by_type('Extinct')

    @property
    def ancient(self) -> list[Code]:
        """
        The codes of type "Ancient"
        """
        return self.by_type('Ancient')

    @property
    def historical(self) -> list[Code]:
        """
        The codes of type "Historical"
        """
        return self.by_type('Historical')

    @property
    def constructed(self) -> list[Code]:
        """
        The codes of type "Constructed"
        """
        return self.by_type('Constructed')

    @property
    def special(self) -> list[Code]:
        """
        The codes of type "Special"
        """
        return self.by_type('Special')

    @property
    def retirements(self) -> list[Code]:
        """
        The retired codes
        """
        return [code for code in self.values() if code.is_retired]

    @property
    def macrolanguages(self) -> list[Code]:
        """
        The macrolanguage codes
        """
        return [code for code in self.values() if code.is_macrolanguage]

    @property
    def languages(self) -> list[Code]:
        """
        The active language codes, i.e. codes which are neither macrolanguage, nor retired,
        nor local codes.
        """
        return [
            code for code in self.values()
            if not (code.is_macrolanguage or code.is_retired or code.is_local)]