commit 36859fa1339dec3e045a4c55d1ab46e24532bd94
parent 22e6e7751e53631b48ba931e28bc9f674245e657
Author: Stefan Koch <programming@stefan-koch.name>
Date: Sun, 3 Sep 2023 12:46:56 +0200
handle missing upos attribute
Diffstat:
2 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/src/nlparrot/tokenization/generic.py b/src/nlparrot/tokenization/generic.py
@@ -57,7 +57,7 @@ class StanzaVocabularyTokenizer(Tokenizer):
for token in sentence.tokens:
data = token.to_dict()[0]
- if data["upos"].lower() not in ["punct", "sym"]:
+ if "upos" in data and data["upos"].lower() not in ["punct", "sym"]:
yield Token(
start=token.start_char + char_offset,
end=token.end_char + char_offset,
diff --git a/tests/tokenization/test_spanish.py b/tests/tokenization/test_spanish.py
@@ -16,3 +16,13 @@ def test_spanish_vocabulary_tokenizer_keeps_whitespace():
assert Token(start=34, end=46, token="enciclopedia", original_text="enciclopedia") in result
assert Token(start=73, end=78, token="todo", original_text="todos") in result
assert Token(start=79, end=85, token="poder", original_text="pueden") in result
+
+
+def test_spanish_handles_missing_upos():
+ tokenizer = StanzaVocabularyTokenizer("es")
+
+ # In the following sentence, "del" might not have a "upos" attribute
+ # (it does not have one in stanza 1.5.0).
+ result = list(tokenizer.tokenize("del grano del Mar Negro."))
+ # analysis should not completely fail
+ assert len(result) > 0