nlparrot

natural language processing server
Log | Files | Refs | README | LICENSE

commit b320b144f884c1b3a761a5d1d9cfe9b89f28927a
parent 56ff9c522a06af9dc9f7795ef2df4fc7c923ac40
Author: Stefan Koch <programming@stefan-koch.name>
Date:   Wed, 12 Jul 2023 20:02:33 +0200

support multiple tokenizers per language

Diffstat:
M src/alphadistill/server.py                  | 14 +++++++++-----
M src/alphadistill/tokenization/factory.py    | 16 ++++++++++++----
2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/src/alphadistill/server.py b/src/alphadistill/server.py
@@ -2,7 +2,7 @@ from multiprocessing.connection import Listener
 import os
 
 from alphadistill.detection import detect_language
-from alphadistill.tokenization.factory import get_tokenizer
+from alphadistill.tokenization.factory import get_tokenizers
 from alphadistill.readability import flesch_reading_ease
 
 
@@ -20,7 +20,7 @@ if __name__ == '__main__':
     print('Starting server ...')
 
     current_language = None
-    tokenizer = None
+    tokenizers = None
 
     # TODO: Receive supported languages from client program
     supported_languages = ['hr', 'ja']
@@ -39,8 +39,8 @@ if __name__ == '__main__':
             language = task['language']
             print(f'Received language "{language}"')
 
-            if tokenizer is None or current_language != language:
-                tokenizer = get_tokenizer(language)
+            if tokenizers is None or current_language != language:
+                tokenizers = get_tokenizers(language)
                 current_language = language
 
             # TODO: Generic way
@@ -49,8 +49,12 @@ if __name__ == '__main__':
             else:
                 reading_ease = None
 
+            tokens = {}
+            for tokenizer_type, tokenizer in tokenizers.items():
+                tokens[tokenizer_type] = [token.to_dict() for token in tokenizer.tokenize(task['text'])]
+
             conn.send({
                 'language': language,
-                'tokens': [token.to_dict() for token in tokenizer.tokenize(task['text'])],
+                'tokens': tokens,
                 'flesch_reading_ease': reading_ease,
             })
diff --git a/src/alphadistill/tokenization/factory.py b/src/alphadistill/tokenization/factory.py
@@ -1,11 +1,19 @@
 from .croatian import CroatianVocabularyTokenizer
-from .japanese import JapaneseKanjiTokenizer
+from .japanese import (
+    JapaneseKanjiTokenizer,
+    JapaneseWordTokenizer,
+)
 
 
-def get_tokenizer(language_code: str):
+def get_tokenizers(language_code: str):
     if language_code.lower() == 'hr':
-        return CroatianVocabularyTokenizer()
+        return {
+            'vocabulary': CroatianVocabularyTokenizer(),
+        }
     if language_code.lower() == 'ja':
-        return JapaneseKanjiTokenizer()
+        return {
+            'kanji': JapaneseKanjiTokenizer(),
+            'vocabulary': JapaneseWordTokenizer(),
+        }
     return None
 