commit 0dca8ae17a13df8b1c7e075239d8ad4ee5fb6581
parent 108a43603bfb9165152896c81b771f96011de7fb
Author: Stefan Koch <programming@stefan-koch.name>
Date: Mon, 1 Jan 2024 11:11:48 +0100
add Finnish support
Diffstat:
4 files changed, 18 insertions(+), 5 deletions(-)
diff --git a/src/nlparrot/cli.py b/src/nlparrot/cli.py
@@ -12,12 +12,12 @@ def cli():
 @cli.command()
 def download_models():
-    click.echo('Downloading stanza models')
+    click.echo("Downloading stanza models")
     for model in ["multilingual", "en", "es", "hr"]:
         stanza.download(model)
-    click.echo('Downloading spacy models')
-    spacy.cli.download('hr_core_news_sm')
+    click.echo("Downloading spacy models")
+    spacy.cli.download("hr_core_news_sm")
 @cli.command()
@@ -25,5 +25,5 @@ def run():
     run_server()
-if __name__ == '__main__':
+if __name__ == "__main__":
     cli()
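Note: the stanza download loop above still lists only "multilingual", "en",
"es", and "hr". If Finnish is meant to use a pre-downloaded stanza model,
download_models() would presumably need "fi" as well (recent stanza versions
can also fetch a model on demand when a pipeline is first built). A minimal
sketch of the assumed addition, not part of this commit:

    # assumption: Finnish model fetched alongside the existing ones
    for model in ["multilingual", "en", "es", "fi", "hr"]:
        stanza.download(model)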
diff --git a/src/nlparrot/tokenization/factory.py b/src/nlparrot/tokenization/factory.py
@@ -2,10 +2,10 @@ from .croatian import CroatianVocabularyTokenizer
 from .generic import StanzaVocabularyTokenizer
 from .japanese import JapaneseKanjiTokenizer, JapaneseWordTokenizer
-
 GENERIC_TOKENIZERS = [
     "en",
     "es",
+    "fi",
 ]
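For context, GENERIC_TOKENIZERS lists the language codes that fall back to the
stanza-backed generic tokenizer, so adding "fi" here is the whole switch for
Finnish. The factory's dispatch code is not part of this hunk; a minimal
sketch of how it plausibly works, where get_tokenizers() and the exact wiring
are assumptions based only on the imports shown above:

    def get_tokenizers(language: str):
        # assumed dispatch: specialised tokenizers first, generic fallback after
        if language == "hr":
            return [CroatianVocabularyTokenizer()]
        if language == "ja":
            return [JapaneseWordTokenizer(), JapaneseKanjiTokenizer()]
        if language in GENERIC_TOKENIZERS:
            return [StanzaVocabularyTokenizer(language)]
        raise ValueError(f"unsupported language: {language}")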
diff --git a/src/nlparrot/tokenization/generic.py b/src/nlparrot/tokenization/generic.py
@@ -53,6 +53,7 @@ class StanzaVocabularyTokenizer(Tokenizer):
         char_offset += lstripped_count
+        # TODO: Might doc.iter_tokens() be better?
         for sentence in doc.sentences:
             for token in sentence.tokens:
                 data = token.to_dict()[0]
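On the new TODO: stanza's Document does expose iter_tokens(), which yields
tokens across all sentences and would flatten the nested loop. A
self-contained sketch of that alternative (the processor list is an
assumption about this project's pipeline configuration):

    import stanza

    # recent stanza versions download the model on demand when it is missing
    nlp = stanza.Pipeline("fi", processors="tokenize,pos,lemma")
    doc = nlp("Rakkautta ja valheita")
    for token in doc.iter_tokens():
        data = token.to_dict()[0]  # first word of a possibly multi-word token
        print(token.start_char, token.end_char, data.get("lemma"), token.text)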
diff --git a/tests/tokenization/test_finnish.py b/tests/tokenization/test_finnish.py
@@ -0,0 +1,12 @@
+from nlparrot.tokenization.generic import StanzaVocabularyTokenizer, Token
+
+
+def test_finnish_tokenizer():
+    tokenizer = StanzaVocabularyTokenizer("fi")
+
+    # spaces and newlines must also be counted
+    result = list(tokenizer.tokenize("  Rakkautta ja valheita \n  Ja pieni sydän täynnä lasin sirpaleita"))
+
+    assert Token(start=2, end=11, token="rakkaus", original_text="Rakkautta") in result
+    assert Token(start=27, end=29, token="ja", original_text="Ja") in result
+    assert Token(start=36, end=41, token="sydän", original_text="sydän") in result
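The Token class imported by the test lives in nlparrot.tokenization.generic
and is not shown in this diff; judging by the asserted fields and the value
equality relied on by the "in result" checks, a plausible shape (an
assumption, for illustration only) is a dataclass:

    from dataclasses import dataclass

    @dataclass(frozen=True)
    class Token:
        start: int            # character offset where the token begins
        end: int              # character offset one past the token's end
        token: str            # normalised form, e.g. the lemma "rakkaus"
        original_text: str    # surface form as it appears in the input

Dataclasses compare by field values, which is what lets the asserts use
membership tests against the tokenizer output.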