nlparrot

natural language processing server
Log | Files | Refs | README | LICENSE

commit 36859fa1339dec3e045a4c55d1ab46e24532bd94
parent 22e6e7751e53631b48ba931e28bc9f674245e657
Author: Stefan Koch <programming@stefan-koch.name>
Date:   Sun,  3 Sep 2023 12:46:56 +0200

handle missing upos attribute

Diffstat:
M src/nlparrot/tokenization/generic.py |  2 +-
M tests/tokenization/test_spanish.py   | 10 ++++++++++
2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/nlparrot/tokenization/generic.py b/src/nlparrot/tokenization/generic.py
@@ -57,7 +57,7 @@ class StanzaVocabularyTokenizer(Tokenizer):
         for token in sentence.tokens:
             data = token.to_dict()[0]
-            if data["upos"].lower() not in ["punct", "sym"]:
+            if "upos" in data and data["upos"].lower() not in ["punct", "sym"]:
                 yield Token(
                     start=token.start_char + char_offset,
                     end=token.end_char + char_offset,
diff --git a/tests/tokenization/test_spanish.py b/tests/tokenization/test_spanish.py
@@ -16,3 +16,13 @@ def test_spanish_vocabulary_tokenizer_keeps_whitespace():
     assert Token(start=34, end=46, token="enciclopedia", original_text="enciclopedia") in result
     assert Token(start=73, end=78, token="todo", original_text="todos") in result
     assert Token(start=79, end=85, token="poder", original_text="pueden") in result
+
+
+def test_spanish_handles_missing_upos():
+    tokenizer = StanzaVocabularyTokenizer("es")
+
+    # in the following sentence "del" might not have a "upos" attribute
+    # (it does not have one in stanza 1.5.0)
+    result = list(tokenizer.tokenize("del grano del Mar Negro."))
+    # analysis should not completely fail
+    assert len(result) > 0