commit 7c0cdc59a5dc9018b3f209a27e0ce3ef76cef822
parent d0275885c00aba892a0ee00e4d151fdd026b319d
Author: Stefan Koch <programming@stefan-koch.name>
Date: Sat, 14 Oct 2023 11:24:48 +0200
Implement English support
Diffstat:
2 files changed, 25 insertions(+), 2 deletions(-)
diff --git a/src/nlparrot/tokenization/factory.py b/src/nlparrot/tokenization/factory.py
@@ -3,10 +3,16 @@ from .generic import StanzaVocabularyTokenizer
from .japanese import JapaneseKanjiTokenizer, JapaneseWordTokenizer
+GENERIC_TOKENIZERS = [
+    "en",
+    "es",
+]
+
+
def get_tokenizers(language_code: str):
- if language_code.lower() == "es":
+ if language_code.lower() in GENERIC_TOKENIZERS:
return {
- "vocabulary": StanzaVocabularyTokenizer("es"),
+ "vocabulary": StanzaVocabularyTokenizer(language_code.lower()),
}
if language_code.lower() == "hr":
return {
diff --git a/tests/tokenization/test_english.py b/tests/tokenization/test_english.py
@@ -0,0 +1,17 @@
+from nlparrot.tokenization.generic import StanzaVocabularyTokenizer, Token
+
+
+def test_english_vocabulary_tokenizer_keeps_whitespace():
+ tokenizer = StanzaVocabularyTokenizer("en")
+
+    # Stanza resets the char counter on each newline and trims each
+    # line, so our offsets also count newline chars and whitespace.
+ result = list(
+ tokenizer.tokenize(
+ " All legislative Powers herein granted \n shall be vested in a Congress of the United States"
+ )
+ )
+
+ assert Token(start=2, end=5, token="all", original_text="All") in result
+ assert Token(start=32, end=39, token="grant", original_text="granted") in result
+ assert Token(start=43, end=48, token="shall", original_text="shall") in result
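
A short usage sketch of the new code path (a hedged illustration, not part of the commit: it assumes the Stanza English model has been downloaded, e.g. via stanza.download("en"), and that the Token fields match the test above):

    from nlparrot.tokenization.factory import get_tokenizers

    # "en" now resolves to the same generic Stanza-backed tokenizer
    # that "es" used before this commit.
    tokenizers = get_tokenizers("en")

    # Offsets refer to the original, untrimmed input text.
    for token in tokenizers["vocabulary"].tokenize("All legislative Powers herein granted"):
        print(token.start, token.end, token.token, token.original_text)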