commit 3158f9f15349ff7cfa0838714511f8c6c930b81f
parent 79e6b82f7fcee0f9b09ef32af27fb3835041015b
Author: Stefan Koch <programming@stefan-koch.name>
Date: Sat, 5 Aug 2023 14:01:07 +0200
apply black formatting
Diffstat:
12 files changed, 72 insertions(+), 67 deletions(-)
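This commit is mechanical reformatting only (single quotes to double quotes, trailing commas, call wrapping), so the hunks below should be behaviour-neutral. It can presumably be reproduced with Black's CLI, e.g. "black src tests" to rewrite in place and "black --check --diff src tests" to verify a clean tree; the exact target paths and any configured line length are assumptions, since the repository's Black configuration is not part of this diff.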
diff --git a/src/nlparrot/detection.py b/src/nlparrot/detection.py
@@ -4,8 +4,8 @@ import stanza
def detect_language(text: str, supported_languages: list[str]) -> str:
# TODO: Cache pipeline
nlp = stanza.Pipeline(
- lang='multilingual',
- processors='langid',
+ lang="multilingual",
+ processors="langid",
# restrict to supported languages
langid_lang_subset=supported_languages,
# TODO: Add a DEV mode in which we use DownloadMethod.DOWNLOAD_RESOURCES
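For context, detect_language() keeps its signature (text: str, supported_languages: list[str]) -> str; only the quoting of the stanza arguments changes. A minimal usage sketch, where the sample sentence and the expected "hr" result are assumptions for illustration:

from nlparrot.detection import detect_language

# Restrict stanza's language identification to the languages the server supports.
language = detect_language(
    "Svijet je prepun prilika.",
    supported_languages=["hr", "ja"],
)
print(language)  # expected: "hr" for this Croatian sentence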
diff --git a/src/nlparrot/readability/flesch_reading_ease.py b/src/nlparrot/readability/flesch_reading_ease.py
@@ -11,7 +11,7 @@ def croatian(text: str):
total_sentences = _sentence_count(doc)
total_words = _word_count(doc)
- dic = pyphen.Pyphen(lang='hr')
+ dic = pyphen.Pyphen(lang="hr")
total_syllables = _syllable_count(doc, dic)
word_length = total_syllables / total_words
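Only the pyphen.Pyphen(lang="hr") call is re-quoted here. As a reminder of how the syllable counting behind the Flesch score works, a hedged sketch follows; _syllable_count() itself is not shown in this diff, so the helper below is an illustrative stand-in rather than the project's implementation:

import pyphen

dic = pyphen.Pyphen(lang="hr")

def syllable_count(word: str) -> int:
    # pyphen reports the possible hyphenation break points; the syllable
    # count is one more than the number of breaks.
    return len(dic.positions(word)) + 1

print(syllable_count("prilika"))  # likely 3 (pri-li-ka)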
diff --git a/src/nlparrot/server.py b/src/nlparrot/server.py
@@ -7,37 +7,37 @@ from nlparrot.readability import flesch_reading_ease
def get_listen_address():
- host = os.getenv('NLPARROT_LISTEN_HOST')
- port = os.getenv('NLPARROT_LISTEN_PORT')
- socket_path = os.getenv('NLPARROT_SOCKET_PATH')
+ host = os.getenv("NLPARROT_LISTEN_HOST")
+ port = os.getenv("NLPARROT_LISTEN_PORT")
+ socket_path = os.getenv("NLPARROT_SOCKET_PATH")
if host and port:
- return {'address': (host, int(port)), 'family': 'AF_INET'}
+ return {"address": (host, int(port)), "family": "AF_INET"}
else:
- return {'address': socket_path, 'family': 'AF_UNIX'}
+ return {"address": socket_path, "family": "AF_UNIX"}
-if __name__ == '__main__':
- print('Starting server ...')
+if __name__ == "__main__":
+ print("Starting server ...")
current_language = None
tokenizers = None
# TODO: Receive supported languages from client program
- supported_languages = ['hr', 'ja']
+ supported_languages = ["hr", "ja"]
with Listener(**get_listen_address()) as listener:
- print('Listening for connection ...')
+ print("Listening for connection ...")
while True:
with listener.accept() as conn:
task: dict = conn.recv()
- if task['language'] is None:
- language = detect_language(task['text'], supported_languages=supported_languages)
+ if task["language"] is None:
+ language = detect_language(task["text"], supported_languages=supported_languages)
print(f'Auto-detected language "{language}"')
else:
- language = task['language']
+ language = task["language"]
print(f'Received language "{language}"')
if tokenizers is None or current_language != language:
@@ -45,17 +45,19 @@ if __name__ == '__main__':
current_language = language
# TODO: Generic way
- if language == 'hr':
- reading_ease = flesch_reading_ease.croatian(task['text'])
+ if language == "hr":
+ reading_ease = flesch_reading_ease.croatian(task["text"])
else:
reading_ease = None
tokens = {}
for tokenizer_type, tokenizer in tokenizers.items():
- tokens[tokenizer_type] = [token.to_dict() for token in tokenizer.tokenize(task['text'])]
-
- conn.send({
- 'language': language,
- 'tokens': tokens,
- 'flesch_reading_ease': reading_ease,
- })
+ tokens[tokenizer_type] = [token.to_dict() for token in tokenizer.tokenize(task["text"])]
+
+ conn.send(
+ {
+ "language": language,
+ "tokens": tokens,
+ "flesch_reading_ease": reading_ease,
+ }
+ )
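The wire protocol is untouched by the reformatting: the server still receives a dict with "language" and "text" over a multiprocessing.connection Listener and replies with "language", "tokens" and "flesch_reading_ease". A minimal client sketch, assuming the server was started with NLPARROT_SOCKET_PATH set to /tmp/nlparrot.sock (the socket path is an assumption, not part of this commit):

from multiprocessing.connection import Client

# Connect to the Unix socket the server listens on (see get_listen_address()).
with Client("/tmp/nlparrot.sock", family="AF_UNIX") as conn:
    conn.send({"language": None, "text": "Svijet je prepun prilika."})
    reply = conn.recv()
    print(reply["language"])             # auto-detected language code
    print(reply["flesch_reading_ease"])  # Flesch score (only computed for "hr")
    print(reply["tokens"])               # tokenizer type -> list of token dicts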
diff --git a/src/nlparrot/tokenization/croatian.py b/src/nlparrot/tokenization/croatian.py
@@ -20,7 +20,7 @@ class CroatianVocabularyTokenizer(Tokenizer):
char_offset = 0
nlp = self._get_stanza()
- for paragraph in text.split('\n'):
+ for paragraph in text.split("\n"):
lstripped = paragraph.lstrip()
lstripped_count = len(paragraph) - len(lstripped)
stripped = lstripped.rstrip()
@@ -34,12 +34,12 @@ class CroatianVocabularyTokenizer(Tokenizer):
for token in sentence.tokens:
data = token.to_dict()[0]
- if data['upos'].lower() not in ['punct', 'sym']:
+ if data["upos"].lower() not in ["punct", "sym"]:
yield Token(
start=token.start_char + char_offset,
end=token.end_char + char_offset,
- token=data['lemma'],
- original_text=data['text'],
+ token=data["lemma"],
+ original_text=data["text"],
)
char_offset += len(stripped)
@@ -48,14 +48,14 @@ class CroatianVocabularyTokenizer(Tokenizer):
char_offset += 1
def _get_stanza(self):
- type_ = 'nonstandard' if self._nonstandard else 'default'
+ type_ = "nonstandard" if self._nonstandard else "default"
if self._stanza is None:
# TODO: For some reason suddenly I do not get a "start_char" anymore
# from classla. Using original stanza for the time being.
self._stanza = stanza.Pipeline(
- 'hr',
- processors='tokenize,pos,lemma,depparse',
+ "hr",
+ processors="tokenize,pos,lemma,depparse",
# TODO: Add a DEV mode in which we use DownloadMethod.DOWNLOAD_RESOURCES
download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)
diff --git a/src/nlparrot/tokenization/factory.py b/src/nlparrot/tokenization/factory.py
@@ -6,14 +6,14 @@ from .japanese import (
def get_tokenizers(language_code: str):
- if language_code.lower() == 'hr':
+ if language_code.lower() == "hr":
return {
- 'vocabulary': CroatianVocabularyTokenizer(),
+ "vocabulary": CroatianVocabularyTokenizer(),
}
- if language_code.lower() == 'ja':
+ if language_code.lower() == "ja":
return {
- 'kanji': JapaneseKanjiTokenizer(),
- 'vocabulary': JapaneseWordTokenizer(),
+ "kanji": JapaneseKanjiTokenizer(),
+ "vocabulary": JapaneseWordTokenizer(),
}
return None
diff --git a/src/nlparrot/tokenization/generic.py b/src/nlparrot/tokenization/generic.py
@@ -12,10 +12,10 @@ class Token:
def to_dict(self):
return {
- 'start': self.start,
- 'end': self.end,
- 'token': self.token,
- 'original_text': self.original_text,
+ "start": self.start,
+ "end": self.end,
+ "token": self.token,
+ "original_text": self.original_text,
}
@classmethod
diff --git a/src/nlparrot/tokenization/japanese.py b/src/nlparrot/tokenization/japanese.py
@@ -11,7 +11,7 @@ from .generic import (
class JapaneseKanjiTokenizer(Tokenizer):
def tokenize(self, text: str) -> collections.abc.Iterable[Token]:
for i, char in enumerate(text):
- if ord('\u3400') <= ord(char) <= ord('\u4dbf') or ord('\u4e00') <= ord(char) <= ord('\u9faf'):
+ if ord("\u3400") <= ord(char) <= ord("\u4dbf") or ord("\u4e00") <= ord(char) <= ord("\u9faf"):
yield Token(
start=i,
end=i + 1,
@@ -22,7 +22,7 @@ class JapaneseKanjiTokenizer(Tokenizer):
class JapaneseWordTokenizer(Tokenizer):
def __init__(self):
- self._tokenizer_obj = dictionary.Dictionary(dict_type='core').create()
+ self._tokenizer_obj = dictionary.Dictionary(dict_type="core").create()
self._mode = tokenizer.Tokenizer.SplitMode.B
def tokenize(self, text: str) -> collections.abc.Iterable[Token]:
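The two Japanese tokenizers are only re-quoted in this commit. For reference, a hedged sketch of the underlying ideas: the kanji tokenizer checks each character against the CJK Unified Ideographs ranges, and the word tokenizer wraps SudachiPy in split mode B. The Morpheme calls below (begin(), end(), surface(), dictionary_form()) are standard SudachiPy methods, but the exact segmentation of the sample sentence is an assumption:

from sudachipy import dictionary, tokenizer

text = "米を食べたい"

# Kanji detection via Unicode ranges (CJK Extension A plus the main CJK block),
# equivalent to the ord() comparisons in JapaneseKanjiTokenizer.
kanji = [c for c in text if "\u3400" <= c <= "\u4dbf" or "\u4e00" <= c <= "\u9faf"]
print(kanji)  # expected: ["米", "食"]

# Word segmentation and lemmatisation with SudachiPy, as in JapaneseWordTokenizer.
sudachi = dictionary.Dictionary(dict_type="core").create()
for m in sudachi.tokenize(text, tokenizer.Tokenizer.SplitMode.B):
    print(m.begin(), m.end(), m.surface(), m.dictionary_form())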
diff --git a/tests/readability/test_flesch_reading_ease.py b/tests/readability/test_flesch_reading_ease.py
@@ -3,12 +3,15 @@ from nlparrot.readability.flesch_reading_ease import croatian
def test_croatian():
# must be in category "simple"
- assert croatian('To je malo.') >= 80
+ assert croatian("To je malo.") >= 80
# from https://hr.wikipedia.org/wiki/Osijek (license: CC-BY-SA 3.0)
# expect "difficult"
- assert croatian(
- '''Najveći je grad u Slavoniji, četvrti po veličini grad u Hrvatskoj,
+ assert (
+ croatian(
+ """Najveći je grad u Slavoniji, četvrti po veličini grad u Hrvatskoj,
te je industrijsko, upravno, akademsko, sudsko i kulturno središte
- Osječko-baranjske županije.'''
- ) < 50
+ Osječko-baranjske županije."""
+ )
+ < 50
+ )
diff --git a/tests/test_detection.py b/tests/test_detection.py
@@ -4,11 +4,11 @@ from nlparrot.detection import detect_language
@pytest.mark.parametrize(
- ['text', 'expected_language'],
+ ["text", "expected_language"],
[
- ('This is an English sentence.', 'en'),
- ('Esta es una frase en español.', 'es'),
- ]
+ ("This is an English sentence.", "en"),
+ ("Esta es una frase en español.", "es"),
+ ],
)
def test_detect_language(text, expected_language):
- assert detect_language(text, supported_languages=['en', 'es', 'hr', 'ja']) == expected_language
+ assert detect_language(text, supported_languages=["en", "es", "hr", "ja"]) == expected_language
diff --git a/tests/tokenization/test_croatian.py b/tests/tokenization/test_croatian.py
@@ -9,12 +9,12 @@ def test_croatian_vocabulary_tokenizer_keeps_whitespace():
# classla re-sets the char counter on each newline, and trims
# each line. We also count newline chars and whitespace.
- result = list(tokenizer.tokenize(' Učim.\n\n\n Svijet je prepun prilika.'))
+ result = list(tokenizer.tokenize(" Učim.\n\n\n Svijet je prepun prilika."))
assert result == [
- Token(start=1, end=5, token='učiti', original_text='Učim'),
- Token(start=12, end=18, token='svijet', original_text='Svijet'),
- Token(start=19, end=21, token='biti', original_text='je'),
- Token(start=22, end=28, token='prepun', original_text='prepun'),
- Token(start=29, end=36, token='prilika', original_text='prilika'),
+ Token(start=1, end=5, token="učiti", original_text="Učim"),
+ Token(start=12, end=18, token="svijet", original_text="Svijet"),
+ Token(start=19, end=21, token="biti", original_text="je"),
+ Token(start=22, end=28, token="prepun", original_text="prepun"),
+ Token(start=29, end=36, token="prilika", original_text="prilika"),
]
diff --git a/tests/tokenization/test_generic.py b/tests/tokenization/test_generic.py
@@ -2,5 +2,5 @@ from nlparrot.tokenization.generic import Token
def test_token_serialization():
- token = Token(start=10, end=12, token='test', original_text='tests')
+ token = Token(start=10, end=12, token="test", original_text="tests")
assert Token.from_dict(token.to_dict()) == token
diff --git a/tests/tokenization/test_japanese.py b/tests/tokenization/test_japanese.py
@@ -7,19 +7,19 @@ from nlparrot.tokenization.japanese import (
def test_japanese_kanji_tokenizer():
tokenizer = JapaneseKanjiTokenizer()
- result = list(tokenizer.tokenize('米を食べたい'))
+ result = list(tokenizer.tokenize("米を食べたい"))
assert result == [
- Token(start=0, end=1, token='米', original_text='米'),
- Token(start=2, end=3, token='食', original_text='食'),
+ Token(start=0, end=1, token="米", original_text="米"),
+ Token(start=2, end=3, token="食", original_text="食"),
]
def test_japanese_word_tokenizer():
tokenizer = JapaneseWordTokenizer()
- result = list(tokenizer.tokenize('米を食べたい'))
+ result = list(tokenizer.tokenize("米を食べたい"))
assert result == [
- Token(start=0, end=1, token='米', original_text='米'),
- Token(start=1, end=2, token='を', original_text='を'),
- Token(start=2, end=4, token='食べる', original_text='食べ'),
- Token(start=4, end=6, token='たい', original_text='たい'),
+ Token(start=0, end=1, token="米", original_text="米"),
+ Token(start=1, end=2, token="を", original_text="を"),
+ Token(start=2, end=4, token="食べる", original_text="食べ"),
+ Token(start=4, end=6, token="たい", original_text="たい"),
]