From 9f05e7b8334142c781e27789891c052243c6c679 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Tue, 18 Jul 2023 20:08:28 -0700 Subject: [PATCH] Just one call to batch.doc.get and reusing the results should be slightly faster --- stanza/pipeline/lemma_processor.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/stanza/pipeline/lemma_processor.py b/stanza/pipeline/lemma_processor.py index 9faf5fec91..86e4708723 100644 --- a/stanza/pipeline/lemma_processor.py +++ b/stanza/pipeline/lemma_processor.py @@ -8,6 +8,8 @@ from stanza.pipeline._constants import * from stanza.pipeline.processor import UDProcessor, register_processor +WORD_TAGS = [doc.TEXT, doc.UPOS] + @register_processor(name=LEMMA) class LemmaProcessor(UDProcessor): @@ -74,7 +76,9 @@ def process(self, document): edits += es if self.config.get('ensemble_dict', False): - preds = self.trainer.postprocess([x for x, y in zip(batch.doc.get([doc.TEXT]), skip) if not y], preds, edits=edits) + word_tags = batch.doc.get(WORD_TAGS) + words = [x[0] for x in word_tags] + preds = self.trainer.postprocess([x for x, y in zip(words, skip) if not y], preds, edits=edits) # expand seq2seq predictions to the same size as all words i = 0 preds1 = [] @@ -84,7 +88,7 @@ def process(self, document): else: preds1.append(preds[i]) i += 1 - preds = self.trainer.ensemble(batch.doc.get([doc.TEXT, doc.UPOS]), preds1) + preds = self.trainer.ensemble(word_tags, preds1) else: preds = self.trainer.postprocess(batch.doc.get([doc.TEXT]), preds, edits=edits)