commit b320b144f884c1b3a761a5d1d9cfe9b89f28927a
parent 56ff9c522a06af9dc9f7795ef2df4fc7c923ac40
Author: Stefan Koch <programming@stefan-koch.name>
Date: Wed, 12 Jul 2023 20:02:33 +0200
support multiple tokenizers per language
Diffstat:
2 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/src/alphadistill/server.py b/src/alphadistill/server.py
@@ -2,7 +2,7 @@ from multiprocessing.connection import Listener
import os
from alphadistill.detection import detect_language
-from alphadistill.tokenization.factory import get_tokenizer
+from alphadistill.tokenization.factory import get_tokenizers
from alphadistill.readability import flesch_reading_ease
@@ -20,7 +20,7 @@ if __name__ == '__main__':
print('Starting server ...')
current_language = None
- tokenizer = None
+ tokenizers = None
# TODO: Receive supported languages from client program
supported_languages = ['hr', 'ja']
@@ -39,8 +39,8 @@ if __name__ == '__main__':
language = task['language']
print(f'Received language "{language}"')
- if tokenizer is None or current_language != language:
- tokenizer = get_tokenizer(language)
+ if tokenizers is None or current_language != language:
+ tokenizers = get_tokenizers(language)
current_language = language
# TODO: Generic way
@@ -49,8 +49,12 @@ if __name__ == '__main__':
else:
reading_ease = None
+ tokens = {}
+ for tokenizer_type, tokenizer in tokenizers.items():
+ tokens[tokenizer_type] = [token.to_dict() for token in tokenizer.tokenize(task['text'])]
+
conn.send({
'language': language,
- 'tokens': [token.to_dict() for token in tokenizer.tokenize(task['text'])],
+ 'tokens': tokens,
'flesch_reading_ease': reading_ease,
})
diff --git a/src/alphadistill/tokenization/factory.py b/src/alphadistill/tokenization/factory.py
@@ -1,11 +1,19 @@
from .croatian import CroatianVocabularyTokenizer
-from .japanese import JapaneseKanjiTokenizer
+from .japanese import (
+ JapaneseKanjiTokenizer,
+ JapaneseWordTokenizer,
+)
-def get_tokenizer(language_code: str):
+def get_tokenizers(language_code: str):
if language_code.lower() == 'hr':
- return CroatianVocabularyTokenizer()
+ return {
+ 'vocabulary': CroatianVocabularyTokenizer(),
+ }
if language_code.lower() == 'ja':
- return JapaneseKanjiTokenizer()
+ return {
+ 'kanji': JapaneseKanjiTokenizer(),
+ 'vocabulary': JapaneseWordTokenizer(),
+ }
return None