nlparrot

natural language processing server

commit 3158f9f15349ff7cfa0838714511f8c6c930b81f
parent 79e6b82f7fcee0f9b09ef32af27fb3835041015b
Author: Stefan Koch <programming@stefan-koch.name>
Date:   Sat,  5 Aug 2023 14:01:07 +0200

apply black formatting
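
This commit only normalizes code style (double-quoted strings, exploded call arguments, trailing commas); no behaviour change is intended. For reference, the same normalization can be reproduced with black itself. A minimal sketch, assuming black is installed and using its default Mode (the project may pin a different line length or black version):

    import black

    # a single-quoted string, as used throughout the pre-commit code
    source = "print('Starting server ...')\n"

    # format_str() applies the same rules as running black on a file;
    # with the default Mode the string literal is rewritten to double quotes
    formatted = black.format_str(source, mode=black.Mode())
    print(formatted)  # print("Starting server ...")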

Diffstat:
M src/nlparrot/detection.py                       |  4 ++--
M src/nlparrot/readability/flesch_reading_ease.py |  2 +-
M src/nlparrot/server.py                          | 44 +++++++++++++++++++++++---------------------
M src/nlparrot/tokenization/croatian.py           | 14 +++++++-------
M src/nlparrot/tokenization/factory.py            | 10 +++++-----
M src/nlparrot/tokenization/generic.py            |  8 ++++----
M src/nlparrot/tokenization/japanese.py           |  4 ++--
M tests/readability/test_flesch_reading_ease.py   | 13 ++++++++-----
M tests/test_detection.py                         | 10 +++++-----
M tests/tokenization/test_croatian.py             | 12 ++++++------
M tests/tokenization/test_generic.py              |  2 +-
M tests/tokenization/test_japanese.py             | 16 ++++++++--------
12 files changed, 72 insertions(+), 67 deletions(-)

diff --git a/src/nlparrot/detection.py b/src/nlparrot/detection.py
@@ -4,8 +4,8 @@ import stanza
 def detect_language(text: str, supported_languages: list[str]) -> str:
     # TODO: Cache pipeline
     nlp = stanza.Pipeline(
-        lang='multilingual',
-        processors='langid',
+        lang="multilingual",
+        processors="langid",
         # restrict to supported languages
         langid_lang_subset=supported_languages,
         # TODO: Add a DEV mode in which we use DownloadMethod.DOWNLOAD_RESOURCES
diff --git a/src/nlparrot/readability/flesch_reading_ease.py b/src/nlparrot/readability/flesch_reading_ease.py
@@ -11,7 +11,7 @@ def croatian(text: str):
     total_sentences = _sentence_count(doc)
     total_words = _word_count(doc)
 
-    dic = pyphen.Pyphen(lang='hr')
+    dic = pyphen.Pyphen(lang="hr")
     total_syllables = _syllable_count(doc, dic)
 
     word_length = total_syllables / total_words
diff --git a/src/nlparrot/server.py b/src/nlparrot/server.py
@@ -7,37 +7,37 @@ from nlparrot.readability import flesch_reading_ease
 
 
 def get_listen_address():
-    host = os.getenv('NLPARROT_LISTEN_HOST')
-    port = os.getenv('NLPARROT_LISTEN_PORT')
-    socket_path = os.getenv('NLPARROT_SOCKET_PATH')
+    host = os.getenv("NLPARROT_LISTEN_HOST")
+    port = os.getenv("NLPARROT_LISTEN_PORT")
+    socket_path = os.getenv("NLPARROT_SOCKET_PATH")
 
     if host and port:
-        return {'address': (host, int(port)), 'family': 'AF_INET'}
+        return {"address": (host, int(port)), "family": "AF_INET"}
     else:
-        return {'address': socket_path, 'family': 'AF_UNIX'}
+        return {"address": socket_path, "family": "AF_UNIX"}
 
 
-if __name__ == '__main__':
-    print('Starting server ...')
+if __name__ == "__main__":
+    print("Starting server ...")
 
     current_language = None
     tokenizers = None
 
     # TODO: Receive supported languages from client program
-    supported_languages = ['hr', 'ja']
+    supported_languages = ["hr", "ja"]
 
     with Listener(**get_listen_address()) as listener:
-        print('Listening for connection ...')
+        print("Listening for connection ...")
 
         while True:
             with listener.accept() as conn:
                 task: dict = conn.recv()
 
-                if task['language'] is None:
-                    language = detect_language(task['text'], supported_languages=supported_languages)
+                if task["language"] is None:
+                    language = detect_language(task["text"], supported_languages=supported_languages)
                     print(f'Auto-detected language "{language}"')
                 else:
-                    language = task['language']
+                    language = task["language"]
                     print(f'Received language "{language}"')
 
                 if tokenizers is None or current_language != language:
@@ -45,17 +45,19 @@ if __name__ == '__main__':
                     current_language = language
 
                 # TODO: Generic way
-                if language == 'hr':
-                    reading_ease = flesch_reading_ease.croatian(task['text'])
+                if language == "hr":
+                    reading_ease = flesch_reading_ease.croatian(task["text"])
                 else:
                     reading_ease = None
 
                 tokens = {}
                 for tokenizer_type, tokenizer in tokenizers.items():
-                    tokens[tokenizer_type] = [token.to_dict() for token in tokenizer.tokenize(task['text'])]
-
-                conn.send({
-                    'language': language,
-                    'tokens': tokens,
-                    'flesch_reading_ease': reading_ease,
-                })
+                    tokens[tokenizer_type] = [token.to_dict() for token in tokenizer.tokenize(task["text"])]
+
+                conn.send(
+                    {
+                        "language": language,
+                        "tokens": tokens,
+                        "flesch_reading_ease": reading_ease,
+                    }
+                )
diff --git a/src/nlparrot/tokenization/croatian.py b/src/nlparrot/tokenization/croatian.py
@@ -20,7 +20,7 @@ class CroatianVocabularyTokenizer(Tokenizer):
         char_offset = 0
         nlp = self._get_stanza()
 
-        for paragraph in text.split('\n'):
+        for paragraph in text.split("\n"):
             lstripped = paragraph.lstrip()
             lstripped_count = len(paragraph) - len(lstripped)
             stripped = lstripped.rstrip()
@@ -34,12 +34,12 @@ class CroatianVocabularyTokenizer(Tokenizer):
                 for token in sentence.tokens:
                     data = token.to_dict()[0]
 
-                    if data['upos'].lower() not in ['punct', 'sym']:
+                    if data["upos"].lower() not in ["punct", "sym"]:
                         yield Token(
                             start=token.start_char + char_offset,
                             end=token.end_char + char_offset,
-                            token=data['lemma'],
-                            original_text=data['text'],
+                            token=data["lemma"],
+                            original_text=data["text"],
                         )
 
             char_offset += len(stripped)
@@ -48,14 +48,14 @@ class CroatianVocabularyTokenizer(Tokenizer):
             char_offset += 1
 
     def _get_stanza(self):
-        type_ = 'nonstandard' if self._nonstandard else 'default'
+        type_ = "nonstandard" if self._nonstandard else "default"
 
         if self._stanza is None:
             # TODO: For some reason suddenly I do not get a "start_char" anymore
            # from classla. Using original stanza for the time being.
             self._stanza = stanza.Pipeline(
-                'hr',
-                processors='tokenize,pos,lemma,depparse',
+                "hr",
+                processors="tokenize,pos,lemma,depparse",
                 # TODO: Add a DEV mode in which we use DownloadMethod.DOWNLOAD_RESOURCES
                 download_method=stanza.DownloadMethod.REUSE_RESOURCES,
             )
diff --git a/src/nlparrot/tokenization/factory.py b/src/nlparrot/tokenization/factory.py
@@ -6,14 +6,14 @@ from .japanese import (
 
 
 def get_tokenizers(language_code: str):
-    if language_code.lower() == 'hr':
+    if language_code.lower() == "hr":
         return {
-            'vocabulary': CroatianVocabularyTokenizer(),
+            "vocabulary": CroatianVocabularyTokenizer(),
         }
-    if language_code.lower() == 'ja':
+    if language_code.lower() == "ja":
         return {
-            'kanji': JapaneseKanjiTokenizer(),
-            'vocabulary': JapaneseWordTokenizer(),
+            "kanji": JapaneseKanjiTokenizer(),
+            "vocabulary": JapaneseWordTokenizer(),
         }
 
     return None
diff --git a/src/nlparrot/tokenization/generic.py b/src/nlparrot/tokenization/generic.py
@@ -12,10 +12,10 @@ class Token:
 
     def to_dict(self):
         return {
-            'start': self.start,
-            'end': self.end,
-            'token': self.token,
-            'original_text': self.original_text,
+            "start": self.start,
+            "end": self.end,
+            "token": self.token,
+            "original_text": self.original_text,
         }
 
     @classmethod
diff --git a/src/nlparrot/tokenization/japanese.py b/src/nlparrot/tokenization/japanese.py
@@ -11,7 +11,7 @@ from .generic import (
 class JapaneseKanjiTokenizer(Tokenizer):
     def tokenize(self, text: str) -> collections.abc.Iterable[Token]:
         for i, char in enumerate(text):
-            if ord('\u3400') <= ord(char) <= ord('\u4dbf') or ord('\u4e00') <= ord(char) <= ord('\u9faf'):
+            if ord("\u3400") <= ord(char) <= ord("\u4dbf") or ord("\u4e00") <= ord(char) <= ord("\u9faf"):
                 yield Token(
                     start=i,
                     end=i + 1,
@@ -22,7 +22,7 @@ class JapaneseKanjiTokenizer(Tokenizer):
 
 class JapaneseWordTokenizer(Tokenizer):
     def __init__(self):
-        self._tokenizer_obj = dictionary.Dictionary(dict_type='core').create()
+        self._tokenizer_obj = dictionary.Dictionary(dict_type="core").create()
         self._mode = tokenizer.Tokenizer.SplitMode.B
 
     def tokenize(self, text: str) -> collections.abc.Iterable[Token]:
diff --git a/tests/readability/test_flesch_reading_ease.py b/tests/readability/test_flesch_reading_ease.py
@@ -3,12 +3,15 @@ from nlparrot.readability.flesch_reading_ease import croatian
 
 def test_croatian():
     # must be in category "simple"
-    assert croatian('To je malo.') >= 80
+    assert croatian("To je malo.") >= 80
 
     # from https://hr.wikipedia.org/wiki/Osijek (license: CC-BY-SA 3.0)
     # expect "difficult"
-    assert croatian(
-        '''Najveći je grad u Slavoniji, četvrti po veličini grad u Hrvatskoj,
+    assert (
+        croatian(
+            """Najveći je grad u Slavoniji, četvrti po veličini grad u Hrvatskoj,
         te je industrijsko, upravno, akademsko, sudsko i kulturno središte
-        Osječko-baranjske županije.'''
-    ) < 50
+        Osječko-baranjske županije."""
+        )
+        < 50
+    )
diff --git a/tests/test_detection.py b/tests/test_detection.py
@@ -4,11 +4,11 @@ from nlparrot.detection import detect_language
 
 
 @pytest.mark.parametrize(
-    ['text', 'expected_language'],
+    ["text", "expected_language"],
     [
-        ('This is an English sentence.', 'en'),
-        ('Esta es una frase en español.', 'es'),
-    ]
+        ("This is an English sentence.", "en"),
+        ("Esta es una frase en español.", "es"),
+    ],
 )
 def test_detect_language(text, expected_language):
-    assert detect_language(text, supported_languages=['en', 'es', 'hr', 'ja']) == expected_language
+    assert detect_language(text, supported_languages=["en", "es", "hr", "ja"]) == expected_language
diff --git a/tests/tokenization/test_croatian.py b/tests/tokenization/test_croatian.py
@@ -9,12 +9,12 @@ def test_croatian_vocabulary_tokenizer_keeps_whitespace():
 
     # classla re-sets the char counter on each newline, and trims
     # each line. We also count newline chars and whitespace.
-    result = list(tokenizer.tokenize(' Učim.\n\n\n   Svijet je prepun prilika.'))
+    result = list(tokenizer.tokenize(" Učim.\n\n\n   Svijet je prepun prilika."))
 
     assert result == [
-        Token(start=1, end=5, token='učiti', original_text='Učim'),
-        Token(start=12, end=18, token='svijet', original_text='Svijet'),
-        Token(start=19, end=21, token='biti', original_text='je'),
-        Token(start=22, end=28, token='prepun', original_text='prepun'),
-        Token(start=29, end=36, token='prilika', original_text='prilika'),
+        Token(start=1, end=5, token="učiti", original_text="Učim"),
+        Token(start=12, end=18, token="svijet", original_text="Svijet"),
+        Token(start=19, end=21, token="biti", original_text="je"),
+        Token(start=22, end=28, token="prepun", original_text="prepun"),
+        Token(start=29, end=36, token="prilika", original_text="prilika"),
     ]
diff --git a/tests/tokenization/test_generic.py b/tests/tokenization/test_generic.py
@@ -2,5 +2,5 @@ from nlparrot.tokenization.generic import Token
 
 
 def test_token_serialization():
-    token = Token(start=10, end=12, token='test', original_text='tests')
+    token = Token(start=10, end=12, token="test", original_text="tests")
     assert Token.from_dict(token.to_dict()) == token
diff --git a/tests/tokenization/test_japanese.py b/tests/tokenization/test_japanese.py
@@ -7,19 +7,19 @@ from nlparrot.tokenization.japanese import (
 
 def test_japanese_kanji_tokenizer():
     tokenizer = JapaneseKanjiTokenizer()
-    result = list(tokenizer.tokenize('米を食べたい'))
+    result = list(tokenizer.tokenize("米を食べたい"))
     assert result == [
-        Token(start=0, end=1, token='米', original_text='米'),
-        Token(start=2, end=3, token='食', original_text='食'),
+        Token(start=0, end=1, token="米", original_text="米"),
+        Token(start=2, end=3, token="食", original_text="食"),
     ]
 
 
 def test_japanese_word_tokenizer():
     tokenizer = JapaneseWordTokenizer()
-    result = list(tokenizer.tokenize('米を食べたい'))
+    result = list(tokenizer.tokenize("米を食べたい"))
     assert result == [
-        Token(start=0, end=1, token='米', original_text='米'),
-        Token(start=1, end=2, token='を', original_text='を'),
-        Token(start=2, end=4, token='食べる', original_text='食べ'),
-        Token(start=4, end=6, token='たい', original_text='たい'),
+        Token(start=0, end=1, token="米", original_text="米"),
+        Token(start=1, end=2, token="を", original_text="を"),
+        Token(start=2, end=4, token="食べる", original_text="食べ"),
+        Token(start=4, end=6, token="たい", original_text="たい"),
     ]