nlparrot

natural language processing server
Log | Files | Refs | README | LICENSE

commit 7c0cdc59a5dc9018b3f209a27e0ce3ef76cef822
parent d0275885c00aba892a0ee00e4d151fdd026b319d
Author: Stefan Koch <programming@stefan-koch.name>
Date:   Sat, 14 Oct 2023 11:24:48 +0200

implement English support

Diffstat:
M src/nlparrot/tokenization/factory.py | 10 ++++++++--
A tests/tokenization/test_english.py | 17 +++++++++++++++++
2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/nlparrot/tokenization/factory.py b/src/nlparrot/tokenization/factory.py
@@ -3,10 +3,16 @@
 from .generic import StanzaVocabularyTokenizer
 from .japanese import JapaneseKanjiTokenizer, JapaneseWordTokenizer
 
+GENERIC_TOKENIZERS = [
+    'en',
+    'es',
+]
+
+
 def get_tokenizers(language_code: str):
-    if language_code.lower() == "es":
+    if language_code.lower() in GENERIC_TOKENIZERS:
         return {
-            "vocabulary": StanzaVocabularyTokenizer("es"),
+            "vocabulary": StanzaVocabularyTokenizer(language_code.lower()),
         }
     if language_code.lower() == "hr":
         return {
diff --git a/tests/tokenization/test_english.py b/tests/tokenization/test_english.py
@@ -0,0 +1,17 @@
+from nlparrot.tokenization.generic import StanzaVocabularyTokenizer, Token
+
+
+def test_english_vocabulary_tokenizer_keeps_whitespace():
+    tokenizer = StanzaVocabularyTokenizer("en")
+
+    # classla re-sets the char counter on each newline, and trims
+    # each line. We also count newline chars and whitespace.
+    result = list(
+        tokenizer.tokenize(
+            " All legislative Powers herein granted \n shall be vested in a Congress of the United States"
+        )
+    )
+
+    assert Token(start=2, end=5, token="all", original_text="All") in result
+    assert Token(start=32, end=39, token="grant", original_text="granted") in result
+    assert Token(start=43, end=48, token="shall", original_text="shall") in result