nlparrot

natural language processing server

commit 0dca8ae17a13df8b1c7e075239d8ad4ee5fb6581
parent 108a43603bfb9165152896c81b771f96011de7fb
Author: Stefan Koch <programming@stefan-koch.name>
Date:   Mon,  1 Jan 2024 11:11:48 +0100

add Finnish support

Diffstat:
M src/nlparrot/cli.py                  |  8 ++++----
M src/nlparrot/tokenization/factory.py |  2 +-
M src/nlparrot/tokenization/generic.py |  1 +
A tests/tokenization/test_finnish.py   | 12 ++++++++++++
4 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/nlparrot/cli.py b/src/nlparrot/cli.py
@@ -12,12 +12,12 @@ def cli():
 
 @cli.command()
 def download_models():
-    click.echo('Downloading stanza models')
+    click.echo("Downloading stanza models")
     for model in ["multilingual", "en", "es", "hr"]:
         stanza.download(model)
 
-    click.echo('Downloading spacy models')
-    spacy.cli.download('hr_core_news_sm')
+    click.echo("Downloading spacy models")
+    spacy.cli.download("hr_core_news_sm")
 
 
 @cli.command()
@@ -25,5 +25,5 @@ def run():
     run_server()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     cli()
diff --git a/src/nlparrot/tokenization/factory.py b/src/nlparrot/tokenization/factory.py
@@ -2,10 +2,10 @@
 from .croatian import CroatianVocabularyTokenizer
 from .generic import StanzaVocabularyTokenizer
 from .japanese import JapaneseKanjiTokenizer, JapaneseWordTokenizer
-
 
 GENERIC_TOKENIZERS = [
     "en",
     "es",
+    "fi",
 ]
 
diff --git a/src/nlparrot/tokenization/generic.py b/src/nlparrot/tokenization/generic.py
@@ -53,6 +53,7 @@ class StanzaVocabularyTokenizer(Tokenizer):
 
         char_offset += lstripped_count
 
+        # TODO: Might doc.iter_tokens() be better?
        for sentence in doc.sentences:
            for token in sentence.tokens:
                data = token.to_dict()[0]
diff --git a/tests/tokenization/test_finnish.py b/tests/tokenization/test_finnish.py
@@ -0,0 +1,12 @@
+from nlparrot.tokenization.generic import StanzaVocabularyTokenizer, Token
+
+
+def test_finnish_tokenizer():
+    tokenizer = StanzaVocabularyTokenizer("fi")
+
+    # spaces and newlines must also be counted
+    result = list(tokenizer.tokenize("  Rakkautta ja valheita \n  Ja pieni sydän täynnä lasin sirpaleita"))
+
+    assert Token(start=2, end=11, token="rakkaus", original_text="Rakkautta") in result
+    assert Token(start=27, end=29, token="ja", original_text="Ja") in result
+    assert Token(start=36, end=41, token="sydän", original_text="sydän") in result
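
For reference, a minimal usage sketch of the new Finnish tokenizer, mirroring the added test; it is not part of the commit. It assumes the stanza "fi" model is downloaded separately (e.g. via stanza.download("fi")), since the download_models command is not changed here.

import stanza

from nlparrot.tokenization.generic import StanzaVocabularyTokenizer

stanza.download("fi")  # assumed prerequisite: fetch the Finnish stanza model once

tokenizer = StanzaVocabularyTokenizer("fi")
for token in tokenizer.tokenize("Ja pieni sydän täynnä lasin sirpaleita"):
    # each Token carries character offsets, the lemma and the original surface form
    print(token.start, token.end, token.token, token.original_text)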