"""
This module provides an API to the information of the ISO-639-3 standard.
ISO-639-3 data is not distributed with this package. Instead, an :class:`ISO` instance can either
be passed the path to a local copy of the zipped ISO tables or it will download them from
`<https://iso639-3.sil.org/code_tables/download_tables>`_
"""
import io
import re
import csv
import string
from typing import Union, Protocol, Optional
import pathlib
import datetime
import functools
import collections
import urllib.request
from collections.abc import Generator, Iterable
from clldutils.path import TemporaryDirectory
from clldutils.ziparchive import ZipArchive
__all__ = ['ISO', 'Code', 'download_tables']
BASE_URL = "https://iso639-3.sil.org/"
ZIP_NAME_PATTERN = re.compile(
r'(?P<name>sites/iso639-3/files/downloads/iso-639-3_Code_Tables_[0-9]{8}.zip)"')
TABLE_NAME_PATTERN = re.compile(r'/iso-639-3(?P<name_and_date>[^.]*)\.tab')
DATESTAMP_PATTERN = re.compile(r'(2[0-9]{3})([0-1][0-9])([0-3][0-9])')
USER_AGENT = 'Mozilla' # It seems a python user-agent doesn't cut it anymore.
# For some reason, the retirements code table gives the wrong replacement codes in two
# cases (although they are described correctly on the website):
CHANGE_TO_ERRATA = {
'guv': ['duz'],
'ymt': ['mtm'],
}
class HasSplitlines(Protocol):  # pylint: disable=too-few-public-methods
    """Structural type for any object that offers a `splitlines()` method."""

    def splitlines(self) -> Iterable[str]:
        """Return the object's content as an iterable of strings."""
        ...  # pragma: no cover
def _open(path):
    """
    Open the URL formed by appending `path` to `BASE_URL`.

    The request carries `USER_AGENT` as its User-Agent header.

    :param path: Path to append to `BASE_URL`.
    :return: The object returned by `urllib.request.urlopen` for the request.
    """
    request = urllib.request.Request(BASE_URL + path, headers={'User-Agent': USER_AGENT})
    return urllib.request.urlopen(request)
def iterrows(lines: Iterable[str]) -> Generator[collections.OrderedDict[str, str], None, None]:
    """
    Turn tab-separated text lines into one ordered mapping per data row.

    The first parsed row supplies the column names; each later row is zipped with
    them, so surplus cells (or surplus column names) are silently dropped.

    :param lines: The lines of a tab-delimited table, header line first.
    :return: Generator of `OrderedDict` objects mapping column name to cell value.
    """
    reader = csv.reader(io.StringIO('\n'.join(lines)), delimiter='\t')
    columns = next(reader, None)  # Header row; `None` only if there is no input at all.
    for cells in reader:
        yield collections.OrderedDict(zip(columns, cells))
class Table(list):
    """
    A code table: the list of its rows (as dicts), labelled with a `name` and a `date`.
    """

    def __init__(self, name_and_date: str, date: str, fp: HasSplitlines):
        """
        :param name_and_date: Underscore-separated table name, possibly including a date stamp.
        :param date: Date stamp of the table in the form YYYYMMDD.
        :param fp: Object whose `splitlines()` yields the tab-separated table content.
        """
        segments = name_and_date.split('_')
        # The ISO 639-3 code tables from 2020-05-15 contain a table with a
        # malformed name - having an excess "0" in the date stamp.
        if segments[-1] == '202000515':  # pragma: no cover
            date = '20200515'
        year, month, day = (int(group) for group in DATESTAMP_PATTERN.match(date).groups())
        self.date = datetime.date(year, month, day)

        # The table name is whatever remains once date stamp segments are removed ...
        label = '_'.join(seg for seg in segments if DATESTAMP_PATTERN.match(seg) is None)
        # ... minus a single leading separator ...
        if label[:1] in ('_', '-'):
            label = label[1:]
        # ... and the main table has no name of its own.
        self.name = label or 'Codes'

        nonblank = [line for line in fp.splitlines() if line.strip()]
        super().__init__(iterrows(nonblank))
def _read_url(path: str) -> bytes:
    """Return the complete response body for `path`, always closing the response."""
    response = _open(path)
    try:
        return response.read()
    finally:
        response.close()


def download_tables(outdir: Optional[Union[str, pathlib.Path]] = None) -> pathlib.Path:
    """
    Download the zipped ISO tables to `outdir` or cwd.

    The download page is searched for the link to the zip archive, which is then
    fetched and stored under its original file name.

    :param outdir: Directory to store the archive in; defaults to the current directory.
    :return: Path of the downloaded zip file.
    :raises ValueError: If no link to a code tables archive is found on the download page.
    """
    page = _read_url('code_tables/download_tables').decode('utf-8-sig')
    match = ZIP_NAME_PATTERN.search(page)
    if not match:
        raise ValueError('no matching zip file name found')  # pragma: no cover
    zip_path = match.group('name')
    # Fetch the payload before touching the file system, so that a failed download
    # does not leave an empty file behind.
    payload = _read_url(zip_path)
    target = pathlib.Path(outdir or '.').joinpath(zip_path.split('/')[-1])
    target.write_bytes(payload)
    return target
def iter_tables(
        zippath: Optional[Union[str, pathlib.Path]] = None) -> Generator[Table, None, None]:
    """
    Yield tables from a code tables zip archive.

    :param zippath: Path of a local code tables archive. If not given, the archive is \
    downloaded into a temporary directory first.
    :return: Generator of one `Table` per archive member matching `TABLE_NAME_PATTERN`.
    :raises ValueError: If the name of a table file contains no date stamp.
    """
    with TemporaryDirectory() as tmp:
        if not zippath:
            zippath = download_tables(tmp)
        with ZipArchive(zippath) as archive:
            for name in archive.namelist():
                match = TABLE_NAME_PATTERN.search(name)
                if not match:
                    # Not a table; such members are not required to carry a date stamp.
                    continue
                stamp = DATESTAMP_PATTERN.search(name)
                if not stamp:
                    raise ValueError(f'no date stamp found in table name: {name}')
                yield Table(match.group('name_and_date'), stamp.group(0), archive.read_text(name))
@functools.total_ordering
class Code:
    """
    Represents one ISO 639-3 code and its associated metadata.

    Codes are hashed, compared and ordered by their three-letter code.

    :ivar str code: The three-letter code
    :ivar str name: The language name
    """
    # Three-letter codes in square brackets, as referenced in retirement remedies.
    _code_pattern = re.compile(r'\[([a-z]{3})]')
    _scope_map = {  # Scopes for items from table Codes.
        'I': 'Individual',
        'M': 'Macrolanguage',
        'S': 'Special',
    }
    _type_map = {  # Language types for items from table Codes.
        'L': 'Living',
        'E': 'Extinct',
        'A': 'Ancient',
        'H': 'Historical',
        'C': 'Constructed',
        'S': 'Special',
    }
    _rtype_map = {  # Retirement reasons for items from table Retirements.
        'C': 'change',
        'D': 'duplicate',
        'N': 'non-existent',
        'S': 'split',
        'M': 'merge',
    }

    def __init__(self, item: dict[str, str], tablename: str, registry: 'ISO'):
        """
        :param item: Row describing the code; must have the keys `Id` and `Ref_Name` plus \
        the table-specific keys read below.
        :param tablename: One of `'Codes'`, `'Retirements'` or `'Local'`.
        :param registry: Mapping used to look up other codes by their three-letter id.
        :raises ValueError: If `tablename` is not one of the supported names.
        """
        code = item['Id']
        self._change_to = []
        self.retired: Union[bool, datetime.date] = False
        if tablename == 'Codes':
            self._scope = self._scope_map[item['Scope']]
            self._type = self._type_map[item['Language_Type']]
        elif tablename == 'Retirements':
            self._scope = 'Retirement'
            self._type = self._rtype_map[item['Ret_Reason']] if item['Ret_Reason'] else None
            self.retired = datetime.date(*map(int, item['Effective'].split('-')))
            if code in CHANGE_TO_ERRATA:
                # Copy the list, so that instances never share state with the constant.
                self._change_to = list(CHANGE_TO_ERRATA[code])  # pragma: no cover
            else:
                if item['Change_To']:
                    # Sanity check of the data: a code cannot be replaced by itself.
                    assert item['Change_To'] != code
                    self._change_to = [item['Change_To']]
                elif item['Ret_Remedy']:
                    # No explicit replacement; fall back to the codes mentioned in the remedy.
                    self._change_to = [
                        c for c in self._code_pattern.findall(item['Ret_Remedy'])
                        if c != code]
        elif tablename == 'Local':
            self._scope = 'Local'
            self._type = 'Special'
        else:
            raise ValueError(tablename)  # pragma: no cover

        self.code: str = code
        self.name: str = item['Ref_Name']
        self._registry = registry

    @property
    def type(self) -> str:
        """
        The type of the code formatted as pair "scope/type"
        """
        return f'{self._scope}/{self._type}'

    @property
    def is_retired(self) -> bool:
        """
        Flag signaling whether the code is retired.
        """
        return bool(self.retired)

    @property
    def change_to(self) -> list['Code']:
        """
        List of codes that supersede a retired code.

        Replacement codes which are retired themselves are resolved recursively.
        """
        res = []
        for code in self._change_to:
            code = self._registry[code]
            if not code.is_retired:
                res.append(code)
            else:
                res.extend(code.change_to)
        return res

    @property
    def is_local(self) -> bool:
        """
        Flag signaling whether the code is in the private use area.
        """
        return self._scope == 'Local'

    @property
    def is_macrolanguage(self) -> bool:
        """
        Flag signaling whether the code has scope "Macrolanguage".
        """
        return self._scope == 'Macrolanguage'

    @property
    def extension(self) -> list['Code']:
        """
        The codes subsumed by a macrolanguage code.
        """
        if not self.is_macrolanguage:
            return []
        return [
            self._registry[c]  # pylint: disable=W0212
            for c in self._registry._macrolanguage[self.code]]  # pylint: disable=W0212

    def __hash__(self):
        return hash(self.code)

    def __eq__(self, other):
        # Defer to the other operand for foreign types, so that comparing with e.g. a
        # string or `None` yields `False` rather than raising `AttributeError`.
        if not isinstance(other, Code):
            return NotImplemented
        return self.code == other.code

    def __lt__(self, other):
        # Ordering against foreign types is undefined; Python then raises `TypeError`.
        if not isinstance(other, Code):
            return NotImplemented
        return self.code < other.code

    def __repr__(self):
        return f'<ISO-639-3 [{self.code}] {self.type}>'

    def __str__(self):
        return f'{self.name} [{self.code}]'
class ISO(collections.OrderedDict):
    """
    Provides access to the content of ISO 639-3's downloadable code table.

    An `ISO` instance maps three-letter codes to :class:`Code` instances, and provides a couple
    of convenience methods.

    Usage:

    .. code-block:: python

        >>> from clldutils.iso_639_3 import ISO
        >>> iso = ISO('iso-639-3_Code_Tables_20220311.zip')
        >>> iso.retirements[0]
        <ISO-639-3 [fri] Retirement/change>
        >>> iso.retirements[0].change_to
        [<ISO-639-3 [fry] Individual/Living>]
    """

    def __init__(self, zippath: Optional[Union[str, pathlib.Path]] = None):
        """
        :param zippath: Path to a local copy of the "Complete Set of Tables" (UTF-8). \
        If `None`, the tables will be retrieved from the web.
        """
        archive = pathlib.Path(zippath) if zippath else None
        self._tables = {table.name: table for table in iter_tables(zippath=archive)}

        # Prefer the date stamp in the archive's file name; otherwise use the newest table.
        stamp = DATESTAMP_PATTERN.search(archive.name) if archive else None
        if stamp:
            self.date: datetime.date = datetime.date(*(int(g) for g in stamp.groups()))
        else:
            self.date = max(table.date for table in self._tables.values())

        # Maps a macrolanguage code to the codes of its member languages.
        self._macrolanguage = collections.defaultdict(list)
        for row in self._tables['macrolanguages']:
            self._macrolanguage[row['M_Id']].append(row['I_Id'])

        super().__init__()
        for tablename in ('Codes', 'Retirements'):
            for row in self._tables[tablename]:
                # Note: we don't keep historical retirements, i.e. ones that have only
                # been in effect for some time. E.g. lcq has been changed to ppr
                # from 2012-02-03 until 2013-01-23 when it was changed back to lcq
                if row['Id'] in self:
                    continue
                self[row['Id']] = Code(row, tablename, self)

        # Codes in the local use area: "qaa" up to and including "qtz".
        letters = string.ascii_lowercase
        for second in letters[:letters.index('t') + 1]:
            for third in letters:
                local = 'q' + second + third
                self[local] = Code({'Id': local, 'Ref_Name': None}, 'Local', self)

    def __str__(self):
        return f'ISO 639-3 code tables from {self.date}'

    def by_type(self, type_) -> list[Code]:
        """Return all codes whose type equals `type_`, in registry order."""
        return [
            code for code in self.values()
            if code._type == type_]  # pylint: disable=protected-access

    @property
    def living(self) -> list[Code]:
        """
        All codes of type "Living"
        """
        return self.by_type('Living')

    @property
    def extinct(self) -> list[Code]:
        """
        All codes of type "Extinct"
        """
        return self.by_type('Extinct')

    @property
    def ancient(self) -> list[Code]:
        """
        All codes of type "Ancient"
        """
        return self.by_type('Ancient')

    @property
    def historical(self) -> list[Code]:
        """
        All codes of type "Historical"
        """
        return self.by_type('Historical')

    @property
    def constructed(self) -> list[Code]:
        """
        All codes of type "Constructed"
        """
        return self.by_type('Constructed')

    @property
    def special(self) -> list[Code]:
        """
        All codes of type "Special"
        """
        return self.by_type('Special')

    @property
    def retirements(self) -> list[Code]:
        """
        All codes flagged as retired
        """
        return [code for code in self.values() if code.is_retired]

    @property
    def macrolanguages(self) -> list[Code]:
        """
        All codes flagged as macrolanguage
        """
        return [code for code in self.values() if code.is_macrolanguage]

    @property
    def languages(self) -> list[Code]:
        """
        All codes that are neither macrolanguages, nor retired, nor in the local use area
        """
        return [
            code for code in self.values()
            if not (code.is_macrolanguage or code.is_retired or code.is_local)]