commit 79e6b82f7fcee0f9b09ef32af27fb3835041015b
parent b320b144f884c1b3a761a5d1d9cfe9b89f28927a
Author: Stefan Koch <programming@stefan-koch.name>
Date:   Sat, 5 Aug 2023 14:00:15 +0200

rename to nlparrot

Diffstat:
25 files changed, 104 insertions(+), 252 deletions(-)
diff --git a/Dockerfile b/Dockerfile
@@ -8,16 +8,16 @@ RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
 COPY requirements.txt .
 RUN pip install -r requirements.txt

-COPY pyproject.toml /opt/alphadistill/
-COPY src/alphadistill /opt/alphadistill/src/alphadistill
-RUN pip install -e /opt/alphadistill
+COPY pyproject.toml /opt/nlparrot/
+COPY src/nlparrot /opt/nlparrot/src/nlparrot
+RUN pip install -e /opt/nlparrot

 RUN python -c 'import stanza; [stanza.download(model) for model in ["multilingual", "hr"]]' \
     && python -m spacy download hr_core_news_sm

 # TODO: Does not play well together with model download out-of-the-box
-# RUN useradd --create-home alphadistill
-# WORKDIR /home/alphadistill
-# USER alphadistill
+# RUN useradd --create-home nlparrot
+# WORKDIR /home/nlparrot
+# USER nlparrot

-CMD ["python", "/opt/alphadistill/src/alphadistill/server.py"]
+CMD ["python", "/opt/nlparrot/src/nlparrot/server.py"]
diff --git a/Makefile b/Makefile
@@ -0,0 +1,9 @@
+.PHONY: check test
+
+check:
+	black --check .
+	isort --check .
+	lint-imports
+
+test:
+	pytest tests
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,5 +1,5 @@
 [project]
-name = "alphadistill"
+name = "nlparrot"
 version = "0.0.1"
 dependencies = [
     'classla',
@@ -10,8 +10,21 @@ dependencies = [
     'sudachidict_core',
 ]

+[project.optional-dependencies]
+tests = [
+    'pytest',
+]
+
 [tool.setuptools]
 package-dir = {"" = "src"}

 [tool.setuptools.packages.find]
 where = ["src"]
+
+[tool.black]
+line-length = 120
+
+[tool.isort]
+profile = "black"
+line_length = 120
+skip_gitignore = true
diff --git a/requirements.txt b/requirements.txt
@@ -16,7 +16,7 @@ certifi==2023.5.7
 charset-normalizer==3.1.0
     # via requests
 classla==1.1.0
-    # via alphadistill (setup.py)
+    # via nlparrot (setup.py)
 click==8.1.3
     # via typer
 cmake==3.26.4
@@ -111,7 +111,7 @@ pydantic==1.10.10
     # spacy
     # thinc
 pyphen==0.14.0
-    # via alphadistill (setup.py)
+    # via nlparrot (setup.py)
 regex==2023.6.3
     # via obeliks
 reldi-tokeniser==1.0.2
@@ -128,7 +128,7 @@ smart-open==6.3.0
     # pathy
     # spacy
 spacy==3.5.4
-    # via alphadistill (setup.py)
+    # via nlparrot (setup.py)
 spacy-legacy==3.0.12
     # via spacy
 spacy-loggers==1.0.4
@@ -139,12 +139,12 @@ srsly==2.4.6
     # spacy
     # thinc
 stanza==1.5.0
-    # via alphadistill (setup.py)
+    # via nlparrot (setup.py)
 sudachidict-core==20230110
-    # via alphadistill (setup.py)
+    # via nlparrot (setup.py)
 sudachipy==0.6.7
     # via
-    #   alphadistill (setup.py)
+    #   nlparrot (setup.py)
     #   sudachidict-core
 sympy==1.12
     # via torch
diff --git a/src/alphadistill/grammartree/croatian.py b/src/alphadistill/grammartree/croatian.py
@@ -1,26 +0,0 @@
-import stanza
-
-from lingobox.server.grammartree.engines import StanzaGrammarTreeBuilder
-
-
-class CroatianGrammarTreeBuilder(StanzaGrammarTreeBuilder):
-    def __init__(self, nonstandard=False):
-        self._nonstandard = nonstandard
-        self._stanza = None
-
-    def _get_stanza(self):
-        type_ = 'nonstandard' if self._nonstandard else 'default'
-
-        if self._stanza is None:
-            # TODO: For some reason suddenly I do not get a "start_char" anymore
-            # from classla. Using original stanza for the time being.
-            self._stanza = stanza.Pipeline(
-                'hr',
-                processors='tokenize,pos,lemma,depparse',
-            )
-            # self._stanza = classla.Pipeline(
-            #     'hr',
-            #     type=type_,
-            # )
-
-        return self._stanza
diff --git a/src/alphadistill/grammartree/engines.py b/src/alphadistill/grammartree/engines.py
@@ -1,108 +0,0 @@
-import stanza
-
-from lingobox.grammar_tree import Node
-
-
-class StanzaGrammarTreeBuilder:
-    def __init__(self, language):
-        self._language = language
-        self._stanza = None
-
-    def grammar_tree(self, text):
-        # classla (and probably also stanza) performs strip() on lines and
-        # re-sets the char counter on newline. To maintain a correct char
-        # counter we need to perform this on our own and memorize what we
-        # removed.
-        char_offset = 0
-        trees = []
-        nlp = self._get_stanza()
-
-        for paragraph in text.split('\n'):
-            lstripped = paragraph.lstrip()
-            lstripped_count = len(paragraph) - len(lstripped)
-            stripped = lstripped.rstrip()
-            rstripped_count = len(lstripped) - len(stripped)
-
-            doc = nlp(stripped)
-
-            char_offset += lstripped_count
-
-            for sentence in doc.sentences:
-                tree = self._stanza_to_tree(sentence)
-                self._adjust_char_pos(tree, char_offset)
-
-                trees.append(tree)
-
-            char_offset += len(stripped)
-            char_offset += rstripped_count
-            # add one for the split \n
-            char_offset += 1
-
-        return trees
-
-    def _get_stanza(self):
-        if self._stanza is None:
-            self._stanza = stanza.Pipeline(
-                self._language,
-                processors='tokenize,mwt,pos,lemma,depparse,ner',
-            )
-
-        return self._stanza
-
-    def _stanza_to_tree(self, sentence):
-        root = None
-        object_ids = {}
-
-        # first create all nodes to make sure they exist
-        for token in sentence.tokens:
-            data = token.to_dict()[0]
-            object_ids[data['id']] = Node(
-                text=data['text'],
-                lemma=data['lemma'] if 'lemma' in data else None,
-                pos_short=data['upos'] if 'upos' in data else None,
-                pos_full=data['xpos'] if 'xpos' in data else None,
-                deprel=data['deprel'] if 'deprel' in data else None,
-                # TODO: According to stanza documentation, a token should have
-                # a public start_char and end_char.
-                # Maybe only classla does not have this?
-                start=token._start_char,
-                end=token._end_char,
-                children=[],
-            )
-
-        # then add children accordingly (make sure that order remains
-        # the same as in sentence for each children list)
-        for token in sentence.tokens:
-            data = token.to_dict()[0]
-            headless_tokens = []
-            if 'head' not in data:
-                # TODO: The word Al in the sentence
-                # "El asalto a la mezquita de Al Aqsa desemboca en la mayor
-                # escalada de violencia en la frontera de Israel y Líbano
-                # desde 2006"
-                # does not have a "head".
-                headless_tokens.append(data)
-            elif data['head'] == 0:
-                # TODO: For now assume that there is exactly a single root.
-                # Did not see a specification on this yet, but it makes
-                # things easier for us later. If it does not hold, we can
-                # create a dummy root or work with a list.
-                # Docs for CoNLL-U at https://universaldependencies.org/format.html
-                if root is not None:
-                    raise Exception('Expecting a single root')
-
-                root = object_ids[data['id']]
-            else:
-                object_ids[data['head']].children.append(object_ids[data['id']])
-
-        for token in headless_tokens:
-            root.children.append(token)
-
-        return root
-
-    def _adjust_char_pos(self, tree, char_count):
-        tree.start += char_count
-        tree.end += char_count
-
-        for child in tree.children:
-            self._adjust_char_pos(child, char_count)
diff --git a/src/alphadistill/grammartree/factory.py b/src/alphadistill/grammartree/factory.py
@@ -1,16 +0,0 @@
-from lingobox.server.grammartree.croatian import CroatianGrammarTreeBuilder
-from lingobox.server.grammartree.engines import StanzaGrammarTreeBuilder
-from lingobox.server.grammartree.japanese import JapaneseKanjiGrammarTreeBuilder
-
-
-def get_tree_builder(language):
-    if language.lower() == 'croatian':
-        return CroatianGrammarTreeBuilder()
-    if language.lower() == 'english':
-        return StanzaGrammarTreeBuilder('en')
-    if language.lower() == 'japanese':
-        return JapaneseKanjiGrammarTreeBuilder()
-    if language.lower() == 'spanish':
-        return StanzaGrammarTreeBuilder('es')
-
-    return None
diff --git a/src/alphadistill/grammartree/japanese.py b/src/alphadistill/grammartree/japanese.py
@@ -1,21 +0,0 @@
-from lingobox.grammar_tree import Node
-
-
-class JapaneseKanjiGrammarTreeBuilder:
-    def grammar_tree(self, text):
-        trees = []
-        for pos, char in enumerate(text):
-            if char.isspace():
-                continue
-
-            trees.append(Node(
-                text=char,
-                lemma=char,
-                pos_short=None,
-                pos_full=None,
-                deprel=None,
-                start=pos,
-                end=pos+1,
-                children=[],
-            ))
-        return trees
diff --git a/src/alphadistill/server.py b/src/alphadistill/server.py
@@ -1,60 +0,0 @@
-from multiprocessing.connection import Listener
-import os
-
-from alphadistill.detection import detect_language
-from alphadistill.tokenization.factory import get_tokenizers
-from alphadistill.readability import flesch_reading_ease
-
-
-def get_listen_address():
-    host = os.getenv('LINGOBOX_SERVER_LISTEN_HOST')
-    port = os.getenv('LINGOBOX_SERVER_LISTEN_PORT')
-
-    if host and port:
-        return {'address': (host, int(port)), 'family': 'AF_INET'}
-    else:
-        return {'address': '/tmp/lingobox/lingobox.sock', 'family': 'AF_UNIX'}
-
-
-if __name__ == '__main__':
-    print('Starting server ...')
-
-    current_language = None
-    tokenizers = None
-
-    # TODO: Receive supported languages from client program
-    supported_languages = ['hr', 'ja']
-
-    with Listener(**get_listen_address()) as listener:
-        print('Listening for connection ...')
-
-        while True:
-            with listener.accept() as conn:
-                task: dict = conn.recv()
-
-                if task['language'] is None:
-                    language = detect_language(task['text'], supported_languages=supported_languages)
-                    print(f'Auto-detected language "{language}"')
-                else:
-                    language = task['language']
-                    print(f'Received language "{language}"')
-
-                if tokenizers is None or current_language != language:
-                    tokenizers = get_tokenizers(language)
-                    current_language = language
-
-                # TODO: Generic way
-                if language == 'hr':
-                    reading_ease = flesch_reading_ease.croatian(task['text'])
-                else:
-                    reading_ease = None
-
-                tokens = {}
-                for tokenizer_type, tokenizer in tokenizers.items():
-                    tokens[tokenizer_type] = [token.to_dict() for token in tokenizer.tokenize(task['text'])]
-
-                conn.send({
-                    'language': language,
-                    'tokens': tokens,
-                    'flesch_reading_ease': reading_ease,
-                })
diff --git a/src/alphadistill/tokenization/__init__.py b/src/alphadistill/tokenization/__init__.py
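Note on the removed grammar_tree() above: it tracked character offsets by hand because the pipeline strips each line and restarts its character counter per line. A minimal, standalone sketch of that bookkeeping; the helper name is illustrative and not part of the codebase:

def paragraph_offsets(text):
    """Map positions in a stripped paragraph back to the original text,
    mirroring the offset bookkeeping of the removed grammar_tree()."""
    offset = 0
    for paragraph in text.split('\n'):
        lstripped = paragraph.lstrip()
        # start of the stripped paragraph within the original text
        start = offset + (len(paragraph) - len(lstripped))
        yield lstripped.rstrip(), start
        offset += len(paragraph) + 1  # + 1 for the '\n' consumed by split()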
diff --git a/src/alphadistill/__init__.py b/src/nlparrot/__init__.py
diff --git a/src/alphadistill/detection.py b/src/nlparrot/detection.py
diff --git a/src/alphadistill/grammartree/__init__.py b/src/nlparrot/readability/__init__.py
diff --git a/src/alphadistill/readability/flesch_reading_ease.py b/src/nlparrot/readability/flesch_reading_ease.py
diff --git a/src/nlparrot/server.py b/src/nlparrot/server.py
@@ -0,0 +1,61 @@
+from multiprocessing.connection import Listener
+import os
+
+from nlparrot.detection import detect_language
+from nlparrot.tokenization.factory import get_tokenizers
+from nlparrot.readability import flesch_reading_ease
+
+
+def get_listen_address():
+    host = os.getenv('NLPARROT_LISTEN_HOST')
+    port = os.getenv('NLPARROT_LISTEN_PORT')
+    socket_path = os.getenv('NLPARROT_SOCKET_PATH')
+
+    if host and port:
+        return {'address': (host, int(port)), 'family': 'AF_INET'}
+    else:
+        return {'address': socket_path, 'family': 'AF_UNIX'}
+
+
+if __name__ == '__main__':
+    print('Starting server ...')
+
+    current_language = None
+    tokenizers = None
+
+    # TODO: Receive supported languages from client program
+    supported_languages = ['hr', 'ja']
+
+    with Listener(**get_listen_address()) as listener:
+        print('Listening for connection ...')
+
+        while True:
+            with listener.accept() as conn:
+                task: dict = conn.recv()
+
+                if task['language'] is None:
+                    language = detect_language(task['text'], supported_languages=supported_languages)
+                    print(f'Auto-detected language "{language}"')
+                else:
+                    language = task['language']
+                    print(f'Received language "{language}"')
+
+                if tokenizers is None or current_language != language:
+                    tokenizers = get_tokenizers(language)
+                    current_language = language
+
+                # TODO: Generic way
+                if language == 'hr':
+                    reading_ease = flesch_reading_ease.croatian(task['text'])
+                else:
+                    reading_ease = None
+
+                tokens = {}
+                for tokenizer_type, tokenizer in tokenizers.items():
+                    tokens[tokenizer_type] = [token.to_dict() for token in tokenizer.tokenize(task['text'])]
+
+                conn.send({
+                    'language': language,
+                    'tokens': tokens,
+                    'flesch_reading_ease': reading_ease,
+                })
diff --git a/src/alphadistill/readability/__init__.py b/src/nlparrot/tokenization/__init__.py
diff --git a/src/alphadistill/tokenization/croatian.py b/src/nlparrot/tokenization/croatian.py
diff --git a/src/alphadistill/tokenization/factory.py b/src/nlparrot/tokenization/factory.py
diff --git a/src/alphadistill/tokenization/generic.py b/src/nlparrot/tokenization/generic.py
diff --git a/src/alphadistill/tokenization/japanese.py b/src/nlparrot/tokenization/japanese.py
diff --git a/tests/readability/test_flesch_reading_ease.py b/tests/readability/test_flesch_reading_ease.py
@@ -1,4 +1,4 @@
-from alphadistill.readability.flesch_reading_ease import croatian
+from nlparrot.readability.flesch_reading_ease import croatian


 def test_croatian():
diff --git a/tests/test_detection.py b/tests/test_detection.py
@@ -1,6 +1,6 @@
 import pytest

-from alphadistill.detection import detect_language
+from nlparrot.detection import detect_language


 @pytest.mark.parametrize(
diff --git a/tests/tokenization/test_croatian.py b/tests/tokenization/test_croatian.py
@@ -1,5 +1,5 @@
-from alphadistill.tokenization.generic import Token
-from alphadistill.tokenization.croatian import (
+from nlparrot.tokenization.generic import Token
+from nlparrot.tokenization.croatian import (
     CroatianVocabularyTokenizer,
 )
diff --git a/tests/tokenization/test_generic.py b/tests/tokenization/test_generic.py
@@ -1,4 +1,4 @@
-from alphadistill.tokenization.generic import Token
+from nlparrot.tokenization.generic import Token


 def test_token_serialization():
diff --git a/tests/tokenization/test_japanese.py b/tests/tokenization/test_japanese.py
@@ -1,5 +1,5 @@
-from alphadistill.tokenization.generic import Token
-from alphadistill.tokenization.japanese import (
+from nlparrot.tokenization.generic import Token
+from nlparrot.tokenization.japanese import (
     JapaneseKanjiTokenizer,
     JapaneseWordTokenizer,
 )
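The renamed server exchanges plain dictionaries over multiprocessing.connection. A minimal client sketch: the request/response keys ('language', 'text', 'tokens', 'flesch_reading_ease') follow server.py above, while the socket path and the sample sentence are assumptions and must match the NLPARROT_SOCKET_PATH the server was started with.

from multiprocessing.connection import Client

# Socket path is an assumption; use whatever NLPARROT_SOCKET_PATH points to.
with Client('/run/nlparrot/nlparrot.sock', family='AF_UNIX') as conn:
    # 'language': None asks the server to auto-detect the language.
    conn.send({'language': None, 'text': 'Ovo je jednostavan primjer.'})
    result = conn.recv()
    print(result['language'], result['flesch_reading_ease'])
    for tokenizer_type, tokens in result['tokens'].items():
        print(tokenizer_type, len(tokens))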