commit 5c67ae4daaaca499ffbf60996a91f54840a75fe0
Author: Stefan Koch <programming@stefan-koch.name>
Date: Sun, 2 Jul 2023 17:36:59 +0200
move server from lingobox into separate repo alphadistill
Diffstat:
22 files changed, 608 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,132 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# IDE settings
+.idea
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,21 @@
+FROM python:3.11-slim-bookworm
+
+ENV VIRTUAL_ENV=/opt/venv
+RUN python3 -m venv $VIRTUAL_ENV
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+COPY setup.py /opt/alphadistill/
+COPY alphadistill /opt/alphadistill/alphadistill
+RUN pip install -e /opt/alphadistill
+
+RUN useradd --create-home alphadistill
+WORKDIR /home/alphadistill
+USER alphadistill
+
+# TODO: Pre-download all required models
+
+CMD ["python", "/opt/alphadistill/alphadistill/server.py"]
diff --git a/alphadistill/__init__.py b/alphadistill/__init__.py
diff --git a/alphadistill/detection.py b/alphadistill/detection.py
@@ -0,0 +1,12 @@
+import stanza
+
+
+def detect_language(text: str, supported_languages: list[str]) -> str:
+ # TODO: Cache pipeline
+ nlp = stanza.Pipeline(
+ lang='multilingual',
+ processors='langid',
+ # restrict to supported languages
+ langid_lang_subset=supported_languages,
+ )
+ return nlp(text).lang
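+
+
+# A minimal caching sketch for the TODO above (an assumption, not part of the
+# module yet; it requires callers to pass the supported languages as a tuple
+# so the argument is hashable):
+#
+#     import functools
+#
+#     @functools.lru_cache(maxsize=1)
+#     def _get_langid_pipeline(supported_languages: tuple[str, ...]):
+#         return stanza.Pipeline(
+#             lang='multilingual',
+#             processors='langid',
+#             langid_lang_subset=list(supported_languages),
+#         )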
diff --git a/alphadistill/grammartree/__init__.py b/alphadistill/grammartree/__init__.py
diff --git a/alphadistill/grammartree/croatian.py b/alphadistill/grammartree/croatian.py
@@ -0,0 +1,26 @@
+import stanza
+
+from alphadistill.grammartree.engines import StanzaGrammarTreeBuilder
+
+
+class CroatianGrammarTreeBuilder(StanzaGrammarTreeBuilder):
+ def __init__(self, nonstandard=False):
+ self._nonstandard = nonstandard
+ self._stanza = None
+
+ def _get_stanza(self):
+ type_ = 'nonstandard' if self._nonstandard else 'default'
+
+ if self._stanza is None:
+            # TODO: For some reason classla no longer returns a "start_char",
+            # so we use plain stanza for the time being.
+ self._stanza = stanza.Pipeline(
+ 'hr',
+ processors='tokenize,pos,lemma,depparse',
+ )
+ # self._stanza = classla.Pipeline(
+ # 'hr',
+ # type=type_,
+ # )
+
+ return self._stanza
diff --git a/alphadistill/grammartree/engines.py b/alphadistill/grammartree/engines.py
@@ -0,0 +1,108 @@
+import stanza
+
+from lingobox.grammar_tree import Node
+
+
+class StanzaGrammarTreeBuilder:
+ def __init__(self, language):
+ self._language = language
+ self._stanza = None
+
+ def grammar_tree(self, text):
+        # classla (and probably also stanza) strips each line and resets the
+        # char counter on newlines. To keep character offsets relative to the
+        # original text, we strip the lines ourselves and track how many
+        # characters we removed.
+ char_offset = 0
+ trees = []
+ nlp = self._get_stanza()
+
+ for paragraph in text.split('\n'):
+ lstripped = paragraph.lstrip()
+ lstripped_count = len(paragraph) - len(lstripped)
+ stripped = lstripped.rstrip()
+ rstripped_count = len(lstripped) - len(stripped)
+
+ doc = nlp(stripped)
+
+ char_offset += lstripped_count
+
+ for sentence in doc.sentences:
+ tree = self._stanza_to_tree(sentence)
+ self._adjust_char_pos(tree, char_offset)
+
+ trees.append(tree)
+
+ char_offset += len(stripped)
+ char_offset += rstripped_count
+ # add one for the split \n
+ char_offset += 1
+
+ return trees
+
+ def _get_stanza(self):
+ if self._stanza is None:
+ self._stanza = stanza.Pipeline(
+ self._language,
+ processors='tokenize,mwt,pos,lemma,depparse,ner',
+ )
+
+ return self._stanza
+
+ def _stanza_to_tree(self, sentence):
+ root = None
+ object_ids = {}
+
+ # first create all nodes to make sure they exist
+ for token in sentence.tokens:
+ data = token.to_dict()[0]
+ object_ids[data['id']] = Node(
+ text=data['text'],
+ lemma=data['lemma'] if 'lemma' in data else None,
+ pos_short=data['upos'] if 'upos' in data else None,
+ pos_full=data['xpos'] if 'xpos' in data else None,
+ deprel=data['deprel'] if 'deprel' in data else None,
+ # TODO: According to stanza documentation, a token should have
+ # a public start_char and end_char.
+ # Maybe only classla does not have this?
+ start=token._start_char,
+ end=token._end_char,
+ children=[],
+ )
+
+ # then add children accordingly (make sure that order remains
+ # the same as in sentence for each children list)
+        headless_tokens = []
+        for token in sentence.tokens:
+            data = token.to_dict()[0]
+ if 'head' not in data:
+ # TODO: The word Al in the sentence
+ # "El asalto a la mezquita de Al Aqsa desemboca en la mayor
+ # escalada de violencia en la frontera de Israel y Líbano
+ # desde 2006"
+ # does not have a "head".
+                headless_tokens.append(object_ids[data['id']])
+ elif data['head'] == 0:
+ # TODO: For now assume that there is exactly a single root.
+ # Did not see a specification on this yet, but it makes
+ # things easier for us later. If it does not hold, we can
+ # create a dummy root or work with a list.
+ # Docs for CoNLL-U at https://universaldependencies.org/format.html
+ if root is not None:
+ raise Exception('Expecting a single root')
+
+ root = object_ids[data['id']]
+ else:
+ object_ids[data['head']].children.append(object_ids[data['id']])
+
+        for node in headless_tokens:
+            root.children.append(node)
+
+ return root
+
+ def _adjust_char_pos(self, tree, char_count):
+ tree.start += char_count
+ tree.end += char_count
+
+ for child in tree.children:
+ self._adjust_char_pos(child, char_count)
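+
+
+# Hypothetical usage sketch (not part of the module): build dependency trees
+# for an English text and walk the Node objects produced by _stanza_to_tree.
+#
+#     builder = StanzaGrammarTreeBuilder('en')
+#     for root in builder.grammar_tree('I read books.\nYou write code.'):
+#         print(root.text, root.deprel, [child.text for child in root.children])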
diff --git a/alphadistill/grammartree/factory.py b/alphadistill/grammartree/factory.py
@@ -0,0 +1,16 @@
+from alphadistill.grammartree.croatian import CroatianGrammarTreeBuilder
+from alphadistill.grammartree.engines import StanzaGrammarTreeBuilder
+from alphadistill.grammartree.japanese import JapaneseKanjiGrammarTreeBuilder
+
+
+def get_tree_builder(language):
+ if language.lower() == 'croatian':
+ return CroatianGrammarTreeBuilder()
+ if language.lower() == 'english':
+ return StanzaGrammarTreeBuilder('en')
+ if language.lower() == 'japanese':
+ return JapaneseKanjiGrammarTreeBuilder()
+ if language.lower() == 'spanish':
+ return StanzaGrammarTreeBuilder('es')
+
+ return None
diff --git a/alphadistill/grammartree/japanese.py b/alphadistill/grammartree/japanese.py
@@ -0,0 +1,21 @@
+from lingobox.grammar_tree import Node
+
+
+class JapaneseKanjiGrammarTreeBuilder:
+ def grammar_tree(self, text):
+ trees = []
+ for pos, char in enumerate(text):
+ if char.isspace():
+ continue
+
+ trees.append(Node(
+ text=char,
+ lemma=char,
+ pos_short=None,
+ pos_full=None,
+ deprel=None,
+ start=pos,
+ end=pos+1,
+ children=[],
+ ))
+ return trees
diff --git a/alphadistill/server.py b/alphadistill/server.py
@@ -0,0 +1,48 @@
+from multiprocessing.connection import Listener
+import os
+
+from alphadistill.detection import detect_language
+from alphadistill.tokenization.factory import get_tokenizer
+
+
+def get_listen_address():
+ host = os.getenv('LINGOBOX_SERVER_LISTEN_HOST')
+ port = os.getenv('LINGOBOX_SERVER_LISTEN_PORT')
+
+ if host and port:
+ return {'address': (host, int(port)), 'family': 'AF_INET'}
+ else:
+ return {'address': '/tmp/lingobox/lingobox.sock', 'family': 'AF_UNIX'}
+
+
+if __name__ == '__main__':
+ print('Starting server ...')
+
+ current_language = None
+ tokenizer = None
+
+ # TODO: Receive supported languages from client program
+ supported_languages = ['hr', 'ja']
+
+ with Listener(**get_listen_address()) as listener:
+ print('Listening for connection ...')
+
+ while True:
+ with listener.accept() as conn:
+ task: dict = conn.recv()
+
+ if task['language'] is None:
+ language = detect_language(task['text'], supported_languages=supported_languages)
+ print(f'Auto-detected language "{language}"')
+ else:
+ language = task['language']
+ print(f'Received language "{language}"')
+
+ if tokenizer is None or current_language != language:
+ tokenizer = get_tokenizer(language)
+ current_language = language
+
+ conn.send({
+ 'language': language,
+ 'tokens': [token.to_dict() for token in tokenizer.tokenize(task['text'])],
+ })
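+
+# Minimal client sketch (hypothetical, not part of this repo): it mirrors the
+# task/response dicts handled above and assumes the default UNIX socket from
+# get_listen_address().
+#
+#     from multiprocessing.connection import Client
+#
+#     with Client('/tmp/lingobox/lingobox.sock', family='AF_UNIX') as conn:
+#         conn.send({'language': None, 'text': 'Učim hrvatski.'})
+#         reply = conn.recv()
+#         print(reply['language'], reply['tokens'])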
diff --git a/alphadistill/tokenization/__init__.py b/alphadistill/tokenization/__init__.py
diff --git a/alphadistill/tokenization/croatian.py b/alphadistill/tokenization/croatian.py
@@ -0,0 +1,65 @@
+import collections.abc
+import stanza
+
+from .generic import (
+ Token,
+ Tokenizer,
+)
+
+
+class CroatianVocabularyTokenizer(Tokenizer):
+ def __init__(self, nonstandard=False):
+ self._nonstandard = nonstandard
+ self._stanza = None
+
+ def tokenize(self, text: str) -> collections.abc.Iterable[Token]:
+        # classla (and probably also stanza) strips each line and resets the
+        # char counter on newlines. To keep character offsets relative to the
+        # original text, we strip the lines ourselves and track how many
+        # characters we removed.
+ char_offset = 0
+ nlp = self._get_stanza()
+
+ for paragraph in text.split('\n'):
+ lstripped = paragraph.lstrip()
+ lstripped_count = len(paragraph) - len(lstripped)
+ stripped = lstripped.rstrip()
+ rstripped_count = len(lstripped) - len(stripped)
+
+ doc = nlp(stripped)
+
+ char_offset += lstripped_count
+
+ for sentence in doc.sentences:
+ for token in sentence.tokens:
+ data = token.to_dict()[0]
+
+ if data['upos'].lower() not in ['punct', 'sym']:
+ yield Token(
+ start=token.start_char + char_offset,
+ end=token.end_char + char_offset,
+ token=data['lemma'],
+ original_text=data['text'],
+ )
+
+ char_offset += len(stripped)
+ char_offset += rstripped_count
+ # add one for the split \n
+ char_offset += 1
+
+ def _get_stanza(self):
+ type_ = 'nonstandard' if self._nonstandard else 'default'
+
+ if self._stanza is None:
+            # TODO: For some reason classla no longer returns a "start_char",
+            # so we use plain stanza for the time being.
+ self._stanza = stanza.Pipeline(
+ 'hr',
+ processors='tokenize,pos,lemma,depparse',
+ )
+ # self._stanza = classla.Pipeline(
+ # 'hr',
+ # type=type_,
+ # )
+
+ return self._stanza
diff --git a/alphadistill/tokenization/factory.py b/alphadistill/tokenization/factory.py
@@ -0,0 +1,11 @@
+from .croatian import CroatianVocabularyTokenizer
+from .japanese import JapaneseKanjiTokenizer
+
+
+def get_tokenizer(language_code: str):
+ if language_code.lower() == 'hr':
+ return CroatianVocabularyTokenizer()
+ if language_code.lower() == 'ja':
+ return JapaneseKanjiTokenizer()
+
+ return None
diff --git a/alphadistill/tokenization/generic.py b/alphadistill/tokenization/generic.py
@@ -0,0 +1,29 @@
+import abc
+import collections.abc
+import dataclasses
+
+
+@dataclasses.dataclass
+class Token:
+ start: int
+ end: int
+ token: str
+ original_text: str
+
+ def to_dict(self):
+ return {
+ 'start': self.start,
+ 'end': self.end,
+ 'token': self.token,
+ 'original_text': self.original_text,
+ }
+
+ @classmethod
+ def from_dict(cls, data: dict):
+ return Token(**data)
+
+
+class Tokenizer(abc.ABC):
+ @abc.abstractmethod
+ def tokenize(self, text: str) -> collections.abc.Iterable[Token]:
+ pass
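+
+
+# Hypothetical sketch of another tokenizer built on this interface (not part
+# of the package): whitespace splitting only, with offsets recovered via
+# str.find from a running position.
+#
+#     class WhitespaceTokenizer(Tokenizer):
+#         def tokenize(self, text: str) -> collections.abc.Iterable[Token]:
+#             pos = 0
+#             for word in text.split():
+#                 start = text.find(word, pos)
+#                 pos = start + len(word)
+#                 yield Token(start=start, end=pos, token=word.lower(),
+#                             original_text=word)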
diff --git a/alphadistill/tokenization/japanese.py b/alphadistill/tokenization/japanese.py
@@ -0,0 +1,35 @@
+import collections.abc
+from sudachipy import tokenizer
+from sudachipy import dictionary
+
+from .generic import (
+ Token,
+ Tokenizer,
+)
+
+
+class JapaneseKanjiTokenizer(Tokenizer):
+ def tokenize(self, text: str) -> collections.abc.Iterable[Token]:
+ for i, char in enumerate(text):
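+            # Keep only kanji: U+3400-U+4DBF is CJK Unified Ideographs
+            # Extension A, U+4E00-U+9FAF lies within CJK Unified Ideographs.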
+ if ord('\u3400') <= ord(char) <= ord('\u4dbf') or ord('\u4e00') <= ord(char) <= ord('\u9faf'):
+ yield Token(
+ start=i,
+ end=i + 1,
+ token=char,
+ original_text=char,
+ )
+
+
+class JapaneseWordTokenizer(Tokenizer):
+ def __init__(self):
+ self._tokenizer_obj = dictionary.Dictionary(dict_type='core').create()
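+        # Sudachi's split mode B is the middle-grained segmentation
+        # (A = shortest units, C = longest units, e.g. named entities).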
+ self._mode = tokenizer.Tokenizer.SplitMode.B
+
+ def tokenize(self, text: str) -> collections.abc.Iterable[Token]:
+ for token in self._tokenizer_obj.tokenize(text, self._mode):
+ yield Token(
+ start=token.begin(),
+ end=token.end(),
+ token=token.dictionary_form(),
+ original_text=token.surface(),
+ )
diff --git a/setup.py b/setup.py
@@ -0,0 +1,19 @@
+from setuptools import setup
+
+setup(
+ name='alphadistill',
+ version='0.1',
+ description='Language learning tools NLP server',
+ author='Stefan Koch',
+ author_email='programming@stefan-koch.name',
+    packages=[
+        'alphadistill',
+        'alphadistill.grammartree',
+        'alphadistill.tokenization',
+    ],
+ install_requires=[
+ 'classla',
+ 'stanza',
+ 'sudachipy',
+ 'sudachidict_core',
+ ],
+)
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/test_detection.py b/tests/test_detection.py
@@ -0,0 +1,14 @@
+import pytest
+
+from alphadistill.detection import detect_language
+
+
+@pytest.mark.parametrize(
+ ['text', 'expected_language'],
+ [
+ ('This is an English sentence.', 'en'),
+ ('Esta es una frase en español.', 'es'),
+ ]
+)
+def test_detect_language(text, expected_language):
+ assert detect_language(text, supported_languages=['en', 'es', 'hr', 'ja']) == expected_language
diff --git a/tests/tokenization/__init__.py b/tests/tokenization/__init__.py
diff --git a/tests/tokenization/test_croatian.py b/tests/tokenization/test_croatian.py
@@ -0,0 +1,20 @@
+from alphadistill.tokenization.generic import Token
+from alphadistill.tokenization.croatian import (
+ CroatianVocabularyTokenizer,
+)
+
+
+def test_croatian_vocabulary_tokenizer_keeps_whitespace():
+ tokenizer = CroatianVocabularyTokenizer()
+
+    # classla resets the char counter on each newline and trims each line,
+    # so offsets must include the stripped whitespace and newline characters.
+ result = list(tokenizer.tokenize(' Učim.\n\n\n Svijet je prepun prilika.'))
+
+ assert result == [
+ Token(start=1, end=5, token='učiti', original_text='Učim'),
+ Token(start=12, end=18, token='svijet', original_text='Svijet'),
+ Token(start=19, end=21, token='biti', original_text='je'),
+ Token(start=22, end=28, token='prepun', original_text='prepun'),
+ Token(start=29, end=36, token='prilika', original_text='prilika'),
+ ]
diff --git a/tests/tokenization/test_generic.py b/tests/tokenization/test_generic.py
@@ -0,0 +1,6 @@
+from alphadistill.tokenization.generic import Token
+
+
+def test_token_serialization():
+ token = Token(start=10, end=12, token='test', original_text='tests')
+ assert Token.from_dict(token.to_dict()) == token
diff --git a/tests/tokenization/test_japanese.py b/tests/tokenization/test_japanese.py
@@ -0,0 +1,25 @@
+from alphadistill.tokenization.generic import Token
+from alphadistill.tokenization.japanese import (
+ JapaneseKanjiTokenizer,
+ JapaneseWordTokenizer,
+)
+
+
+def test_japanese_kanji_tokenizer():
+ tokenizer = JapaneseKanjiTokenizer()
+ result = list(tokenizer.tokenize('米を食べたい'))
+ assert result == [
+ Token(start=0, end=1, token='米', original_text='米'),
+ Token(start=2, end=3, token='食', original_text='食'),
+ ]
+
+
+def test_japanese_word_tokenizer():
+ tokenizer = JapaneseWordTokenizer()
+ result = list(tokenizer.tokenize('米を食べたい'))
+ assert result == [
+ Token(start=0, end=1, token='米', original_text='米'),
+ Token(start=1, end=2, token='を', original_text='を'),
+ Token(start=2, end=4, token='食べる', original_text='食べ'),
+ Token(start=4, end=6, token='たい', original_text='たい'),
+ ]