nlparrot

natural language processing server

commit 5c67ae4daaaca499ffbf60996a91f54840a75fe0
Author: Stefan Koch <programming@stefan-koch.name>
Date:   Sun,  2 Jul 2023 17:36:59 +0200

move server from lingobox into separate repo alphadistill

Diffstat:
A .gitignore | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A Dockerfile | 21 +++++++++++++++++++++
A alphadistill/__init__.py | 0
A alphadistill/detection.py | 12 ++++++++++++
A alphadistill/grammartree/__init__.py | 0
A alphadistill/grammartree/croatian.py | 26 ++++++++++++++++++++++++++
A alphadistill/grammartree/engines.py | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A alphadistill/grammartree/factory.py | 16 ++++++++++++++++
A alphadistill/grammartree/japanese.py | 21 +++++++++++++++++++++
A alphadistill/server.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
A alphadistill/tokenization/__init__.py | 0
A alphadistill/tokenization/croatian.py | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A alphadistill/tokenization/factory.py | 11 +++++++++++
A alphadistill/tokenization/generic.py | 29 +++++++++++++++++++++++++++++
A alphadistill/tokenization/japanese.py | 35 +++++++++++++++++++++++++++++++++++
A setup.py | 19 +++++++++++++++++++
A tests/__init__.py | 0
A tests/test_detection.py | 14 ++++++++++++++
A tests/tokenization/__init__.py | 0
A tests/tokenization/test_croatian.py | 20 ++++++++++++++++++++
A tests/tokenization/test_generic.py | 6 ++++++
A tests/tokenization/test_japanese.py | 25 +++++++++++++++++++++++++
22 files changed, 608 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,132 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# IDE settings
+.idea
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,21 @@
+FROM python:3.11-slim-bookworm
+
+ENV VIRTUAL_ENV=/opt/venv
+RUN python3 -m venv $VIRTUAL_ENV
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+COPY setup.py /opt/alphadistill/
+COPY alphadistill /opt/alphadistill/alphadistill
+RUN pip install -e /opt/alphadistill
+
+RUN useradd --create-home alphadistill
+WORKDIR /home/alphadistill
+USER alphadistill
+
+# TODO: Pre-download all required models
+
+CMD ["python", "/opt/alphadistill/alphadistill/server.py"]
diff --git a/alphadistill/__init__.py b/alphadistill/__init__.py
diff --git a/alphadistill/detection.py b/alphadistill/detection.py
@@ -0,0 +1,12 @@
+import stanza
+
+
+def detect_language(text: str, supported_languages: list[str]) -> str:
+    # TODO: Cache pipeline
+    nlp = stanza.Pipeline(
+        lang='multilingual',
+        processors='langid',
+        # restrict to supported languages
+        langid_lang_subset=supported_languages,
+    )
+    return nlp(text).lang
diff --git a/alphadistill/grammartree/__init__.py b/alphadistill/grammartree/__init__.py
diff --git a/alphadistill/grammartree/croatian.py b/alphadistill/grammartree/croatian.py
@@ -0,0 +1,26 @@
+import stanza
+
+from lingobox.server.grammartree.engines import StanzaGrammarTreeBuilder
+
+
+class CroatianGrammarTreeBuilder(StanzaGrammarTreeBuilder):
+    def __init__(self, nonstandard=False):
+        self._nonstandard = nonstandard
+        self._stanza = None
+
+    def _get_stanza(self):
+        type_ = 'nonstandard' if self._nonstandard else 'default'
+
+        if self._stanza is None:
+            # TODO: For some reason suddenly I do not get a "start_char" anymore
+            # from classla. Using original stanza for the time being.
+            self._stanza = stanza.Pipeline(
+                'hr',
+                processors='tokenize,pos,lemma,depparse',
+            )
+            # self._stanza = classla.Pipeline(
+            #     'hr',
+            #     type=type_,
+            # )
+
+        return self._stanza
diff --git a/alphadistill/grammartree/engines.py b/alphadistill/grammartree/engines.py
@@ -0,0 +1,108 @@
+import stanza
+
+from lingobox.grammar_tree import Node
+
+
+class StanzaGrammarTreeBuilder:
+    def __init__(self, language):
+        self._language = language
+        self._stanza = None
+
+    def grammar_tree(self, text):
+        # classla (and probably also stanza) performs strip() on lines and
+        # re-sets the char counter on newline. To maintain a correct char
+        # counter we need to perform this on our own and memorize what we
+        # removed.
+        char_offset = 0
+        trees = []
+        nlp = self._get_stanza()
+
+        for paragraph in text.split('\n'):
+            lstripped = paragraph.lstrip()
+            lstripped_count = len(paragraph) - len(lstripped)
+            stripped = lstripped.rstrip()
+            rstripped_count = len(lstripped) - len(stripped)
+
+            doc = nlp(stripped)
+
+            char_offset += lstripped_count
+
+            for sentence in doc.sentences:
+                tree = self._stanza_to_tree(sentence)
+                self._adjust_char_pos(tree, char_offset)
+
+                trees.append(tree)
+
+            char_offset += len(stripped)
+            char_offset += rstripped_count
+            # add one for the split \n
+            char_offset += 1
+
+        return trees
+
+    def _get_stanza(self):
+        if self._stanza is None:
+            self._stanza = stanza.Pipeline(
+                self._language,
+                processors='tokenize,mwt,pos,lemma,depparse,ner',
+            )
+
+        return self._stanza
+
+    def _stanza_to_tree(self, sentence):
+        root = None
+        object_ids = {}
+
+        # first create all nodes to make sure they exist
+        for token in sentence.tokens:
+            data = token.to_dict()[0]
+            object_ids[data['id']] = Node(
+                text=data['text'],
+                lemma=data['lemma'] if 'lemma' in data else None,
+                pos_short=data['upos'] if 'upos' in data else None,
+                pos_full=data['xpos'] if 'xpos' in data else None,
+                deprel=data['deprel'] if 'deprel' in data else None,
+                # TODO: According to stanza documentation, a token should have
+                # a public start_char and end_char.
+                # Maybe only classla does not have this?
+                start=token._start_char,
+                end=token._end_char,
+                children=[],
+            )
+
+        # then add children accordingly (make sure that order remains
+        # the same as in sentence for each children list)
+        for token in sentence.tokens:
+            data = token.to_dict()[0]
+            headless_tokens = []
+            if 'head' not in data:
+                # TODO: The word Al in the sentence
+                # "El asalto a la mezquita de Al Aqsa desemboca en la mayor
+                # escalada de violencia en la frontera de Israel y Líbano
+                # desde 2006"
+                # does not have a "head".
+                headless_tokens.append(data)
+            elif data['head'] == 0:
+                # TODO: For now assume that there is exactly a single root.
+                # Did not see a specification on this yet, but it makes
+                # things easier for us later. If it does not hold, we can
+                # create a dummy root or work with a list.
+                # Docs for CoNLL-U at https://universaldependencies.org/format.html
+                if root is not None:
+                    raise Exception('Expecting a single root')
+
+                root = object_ids[data['id']]
+            else:
+                object_ids[data['head']].children.append(object_ids[data['id']])
+
+        for token in headless_tokens:
+            root.children.append(token)
+
+        return root
+
+    def _adjust_char_pos(self, tree, char_count):
+        tree.start += char_count
+        tree.end += char_count
+
+        for child in tree.children:
+            self._adjust_char_pos(child, char_count)
diff --git a/alphadistill/grammartree/factory.py b/alphadistill/grammartree/factory.py
@@ -0,0 +1,16 @@
+from lingobox.server.grammartree.croatian import CroatianGrammarTreeBuilder
+from lingobox.server.grammartree.engines import StanzaGrammarTreeBuilder
+from lingobox.server.grammartree.japanese import JapaneseKanjiGrammarTreeBuilder
+
+
+def get_tree_builder(language):
+    if language.lower() == 'croatian':
+        return CroatianGrammarTreeBuilder()
+    if language.lower() == 'english':
+        return StanzaGrammarTreeBuilder('en')
+    if language.lower() == 'japanese':
+        return JapaneseKanjiGrammarTreeBuilder()
+    if language.lower() == 'spanish':
+        return StanzaGrammarTreeBuilder('es')
+
+    return None
diff --git a/alphadistill/grammartree/japanese.py b/alphadistill/grammartree/japanese.py
@@ -0,0 +1,21 @@
+from lingobox.grammar_tree import Node
+
+
+class JapaneseKanjiGrammarTreeBuilder:
+    def grammar_tree(self, text):
+        trees = []
+        for pos, char in enumerate(text):
+            if char.isspace():
+                continue
+
+            trees.append(Node(
+                text=char,
+                lemma=char,
+                pos_short=None,
+                pos_full=None,
+                deprel=None,
+                start=pos,
+                end=pos+1,
+                children=[],
+            ))
+        return trees
diff --git a/alphadistill/server.py b/alphadistill/server.py
@@ -0,0 +1,48 @@
+from multiprocessing.connection import Listener
+import os
+
+from alphadistill.detection import detect_language
+from alphadistill.tokenization.factory import get_tokenizer
+
+
+def get_listen_address():
+    host = os.getenv('LINGOBOX_SERVER_LISTEN_HOST')
+    port = os.getenv('LINGOBOX_SERVER_LISTEN_PORT')
+
+    if host and port:
+        return {'address': (host, int(port)), 'family': 'AF_INET'}
+    else:
+        return {'address': '/tmp/lingobox/lingobox.sock', 'family': 'AF_UNIX'}
+
+
+if __name__ == '__main__':
+    print('Starting server ...')
+
+    current_language = None
+    tokenizer = None
+
+    # TODO: Receive supported languages from client program
+    supported_languages = ['hr', 'ja']
+
+    with Listener(**get_listen_address()) as listener:
+        print('Listening for connection ...')
+
+        while True:
+            with listener.accept() as conn:
+                task: dict = conn.recv()
+
+                if task['language'] is None:
+                    language = detect_language(task['text'], supported_languages=supported_languages)
+                    print(f'Auto-detected language "{language}"')
+                else:
+                    language = task['language']
+                    print(f'Received language "{language}"')
+
+                if tokenizer is None or current_language != language:
+                    tokenizer = get_tokenizer(language)
+                    current_language = language
+
+                conn.send({
+                    'language': language,
+                    'tokens': [token.to_dict() for token in tokenizer.tokenize(task['text'])],
+                })
diff --git a/alphadistill/tokenization/__init__.py b/alphadistill/tokenization/__init__.py
diff --git a/alphadistill/tokenization/croatian.py b/alphadistill/tokenization/croatian.py
@@ -0,0 +1,65 @@
+import collections.abc
+import stanza
+
+from .generic import (
+    Token,
+    Tokenizer,
+)
+
+
+class CroatianVocabularyTokenizer(Tokenizer):
+    def __init__(self, nonstandard=False):
+        self._nonstandard = nonstandard
+        self._stanza = None
+
+    def tokenize(self, text: str) -> collections.abc.Iterable[Token]:
+        # classla (and probably also stanza) performs strip() on lines and
+        # re-sets the char counter on newline. To maintain a correct char
+        # counter we need to perform this on our own and memorize what we
+        # removed.
+        char_offset = 0
+        nlp = self._get_stanza()
+
+        for paragraph in text.split('\n'):
+            lstripped = paragraph.lstrip()
+            lstripped_count = len(paragraph) - len(lstripped)
+            stripped = lstripped.rstrip()
+            rstripped_count = len(lstripped) - len(stripped)
+
+            doc = nlp(stripped)
+
+            char_offset += lstripped_count
+
+            for sentence in doc.sentences:
+                for token in sentence.tokens:
+                    data = token.to_dict()[0]
+
+                    if data['upos'].lower() not in ['punct', 'sym']:
+                        yield Token(
+                            start=token.start_char + char_offset,
+                            end=token.end_char + char_offset,
+                            token=data['lemma'],
+                            original_text=data['text'],
+                        )
+
+            char_offset += len(stripped)
+            char_offset += rstripped_count
+            # add one for the split \n
+            char_offset += 1
+
+    def _get_stanza(self):
+        type_ = 'nonstandard' if self._nonstandard else 'default'
+
+        if self._stanza is None:
+            # TODO: For some reason suddenly I do not get a "start_char" anymore
+            # from classla. Using original stanza for the time being.
+            self._stanza = stanza.Pipeline(
+                'hr',
+                processors='tokenize,pos,lemma,depparse',
+            )
+            # self._stanza = classla.Pipeline(
+            #     'hr',
+            #     type=type_,
+            # )
+
+        return self._stanza
diff --git a/alphadistill/tokenization/factory.py b/alphadistill/tokenization/factory.py
@@ -0,0 +1,11 @@
+from .croatian import CroatianVocabularyTokenizer
+from .japanese import JapaneseKanjiTokenizer
+
+
+def get_tokenizer(language_code: str):
+    if language_code.lower() == 'hr':
+        return CroatianVocabularyTokenizer()
+    if language_code.lower() == 'ja':
+        return JapaneseKanjiTokenizer()
+
+    return None
diff --git a/alphadistill/tokenization/generic.py b/alphadistill/tokenization/generic.py
@@ -0,0 +1,29 @@
+import abc
+import collections.abc
+import dataclasses
+
+
+@dataclasses.dataclass
+class Token:
+    start: int
+    end: int
+    token: str
+    original_text: str
+
+    def to_dict(self):
+        return {
+            'start': self.start,
+            'end': self.end,
+            'token': self.token,
+            'original_text': self.original_text,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        return Token(**data)
+
+
+class Tokenizer(abc.ABC):
+    @abc.abstractmethod
+    def tokenize(self, text: str) -> collections.abc.Iterable[Token]:
+        pass
diff --git a/alphadistill/tokenization/japanese.py b/alphadistill/tokenization/japanese.py
@@ -0,0 +1,35 @@
+import collections.abc
+from sudachipy import tokenizer
+from sudachipy import dictionary
+
+from .generic import (
+    Token,
+    Tokenizer,
+)
+
+
+class JapaneseKanjiTokenizer(Tokenizer):
+    def tokenize(self, text: str) -> collections.abc.Iterable[Token]:
+        for i, char in enumerate(text):
+            if ord('\u3400') <= ord(char) <= ord('\u4dbf') or ord('\u4e00') <= ord(char) <= ord('\u9faf'):
+                yield Token(
+                    start=i,
+                    end=i + 1,
+                    token=char,
+                    original_text=char,
+                )
+
+
+class JapaneseWordTokenizer(Tokenizer):
+    def __init__(self):
+        self._tokenizer_obj = dictionary.Dictionary(dict_type='core').create()
+        self._mode = tokenizer.Tokenizer.SplitMode.B
+
+    def tokenize(self, text: str) -> collections.abc.Iterable[Token]:
+        for token in self._tokenizer_obj.tokenize(text, self._mode):
+            yield Token(
+                start=token.begin(),
+                end=token.end(),
+                token=token.dictionary_form(),
+                original_text=token.surface(),
+            )
diff --git a/setup.py b/setup.py
@@ -0,0 +1,19 @@
+from setuptools import setup
+
+setup(
+    name='alphadistill',
+    version='0.1',
+    description='Language learning tools NLP server',
+    author='Stefan Koch',
+    author_email='programming@stefan-koch.name',
+    packages=[
+        'alphadistill',
+        'alphadistill.tokenization',
+    ],
+    install_requires=[
+        'classla',
+        'stanza',
+        'sudachipy',
+        'sudachidict_core',
+    ],
+)
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/test_detection.py b/tests/test_detection.py
@@ -0,0 +1,14 @@
+import pytest
+
+from alphadistill.detection import detect_language
+
+
+@pytest.mark.parametrize(
+    ['text', 'expected_language'],
+    [
+        ('This is an English sentence.', 'en'),
+        ('Esta es una frase en español.', 'es'),
+    ]
+)
+def test_detect_language(text, expected_language):
+    assert detect_language(text, supported_languages=['en', 'es', 'hr', 'ja']) == expected_language
diff --git a/tests/tokenization/__init__.py b/tests/tokenization/__init__.py
diff --git a/tests/tokenization/test_croatian.py b/tests/tokenization/test_croatian.py
@@ -0,0 +1,20 @@
+from alphadistill.tokenization.generic import Token
+from alphadistill.tokenization.croatian import (
+    CroatianVocabularyTokenizer,
+)
+
+
+def test_croatian_vocabulary_tokenizer_keeps_whitespace():
+    tokenizer = CroatianVocabularyTokenizer()
+
+    # classla re-sets the char counter on each newline, and trims
+    # each line. We also count newline chars and whitespace.
+    result = list(tokenizer.tokenize(' Učim.\n\n\n   Svijet je prepun prilika.'))
+
+    assert result == [
+        Token(start=1, end=5, token='učiti', original_text='Učim'),
+        Token(start=12, end=18, token='svijet', original_text='Svijet'),
+        Token(start=19, end=21, token='biti', original_text='je'),
+        Token(start=22, end=28, token='prepun', original_text='prepun'),
+        Token(start=29, end=36, token='prilika', original_text='prilika'),
+    ]
diff --git a/tests/tokenization/test_generic.py b/tests/tokenization/test_generic.py
@@ -0,0 +1,6 @@
+from alphadistill.tokenization.generic import Token
+
+
+def test_token_serialization():
+    token = Token(start=10, end=12, token='test', original_text='tests')
+    assert Token.from_dict(token.to_dict()) == token
diff --git a/tests/tokenization/test_japanese.py b/tests/tokenization/test_japanese.py
@@ -0,0 +1,25 @@
+from alphadistill.tokenization.generic import Token
+from alphadistill.tokenization.japanese import (
+    JapaneseKanjiTokenizer,
+    JapaneseWordTokenizer,
+)
+
+
+def test_japanese_kanji_tokenizer():
+    tokenizer = JapaneseKanjiTokenizer()
+    result = list(tokenizer.tokenize('米を食べたい'))
+    assert result == [
+        Token(start=0, end=1, token='米', original_text='米'),
+        Token(start=2, end=3, token='食', original_text='食'),
+    ]
+
+
+def test_japanese_word_tokenizer():
+    tokenizer = JapaneseWordTokenizer()
+    result = list(tokenizer.tokenize('米を食べたい'))
+    assert result == [
+        Token(start=0, end=1, token='米', original_text='米'),
+        Token(start=1, end=2, token='を', original_text='を'),
+        Token(start=2, end=4, token='食べる', original_text='食べ'),
+        Token(start=4, end=6, token='たい', original_text='たい'),
+    ]
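
For reference, a minimal client sketch (not part of this commit) showing how a caller could talk to alphadistill/server.py; it assumes the server is already running and reachable at the default Unix socket path returned by get_listen_address(), and the language code and sample text are only illustrative.

from multiprocessing.connection import Client

# Default AF_UNIX address used by get_listen_address() when the
# LINGOBOX_SERVER_LISTEN_HOST/PORT environment variables are not set.
ADDRESS = '/tmp/lingobox/lingobox.sock'

with Client(ADDRESS, family='AF_UNIX') as conn:
    # 'language' may be set to None to let the server auto-detect it.
    conn.send({'language': 'hr', 'text': 'Svijet je prepun prilika.'})
    reply = conn.recv()
    print(reply['language'])
    for token in reply['tokens']:
        print(token['start'], token['end'], token['token'], token['original_text'])

The server creates its tokenizer lazily and reuses it as long as consecutive requests arrive in the same language, so a client that groups texts by language avoids repeated pipeline loading.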