Skip to content

Commit

Permalink
Add tags to Text object, move ignore_punct out of reader functions
Browse files Browse the repository at this point in the history
  • Loading branch information
tsproisl committed Dec 9, 2021
1 parent e1b06ca commit da77021
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 25 deletions.
16 changes: 9 additions & 7 deletions textcomplexity/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,18 +147,20 @@ def main():
punct_tags = set(punct_tags)
all_results = {}
for i, f in enumerate(args.TEXT):
tokens, tagged, graphs, ps_trees = None, None, None, None
tokens, sentences, graphs, ps_trees = None, None, None, None
if args.input_format == "conllu":
tokens, tagged, graphs = zip(*conllu.read_conllu_sentences(f, ignore_punct=args.ignore_punct, punct_tags=punct_tags))
tokens = list(itertools.chain.from_iterable(tokens))
sentences, graphs = zip(*conllu.read_conllu_sentences(f))
tokens = list(itertools.chain.from_iterable(sentences))
elif args.input_format == "tsv":
tokens, tagged, graphs, ps_trees = zip(*custom_tsv.read_tsv_sentences(f, ignore_punct=args.ignore_punct, punct_tags=punct_tags))
tokens = list(itertools.chain.from_iterable(tokens))
sentences, graphs, ps_trees = zip(*custom_tsv.read_tsv_sentences(f))
tokens = list(itertools.chain.from_iterable(sentences))
if args.ignore_punct and tokens is not None:
tokens = [t for t in tokens if t.pos not in punct_tags]
results = []
if args.sur and tokens is not None:
results.extend(surface_based(tokens, args.window_size, args.all_measures))
if args.sent and tagged is not None:
results.extend(sentence_based(tagged, punct_tags))
if args.sent and sentences is not None:
results.extend(sentence_based(sentences, punct_tags))
if args.dep and graphs is not None:
results.extend(dependency_based(graphs))
if args.const and ps_trees is not None:
Expand Down
14 changes: 10 additions & 4 deletions textcomplexity/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,24 @@

class Text:
    """Bundle a tokenized text with its POS tags and basic frequency
    statistics (frequency list and frequency spectrum).
    """

    def __init__(self, tokens, tags, text_length, vocabulary_size, frequency_list, frequency_spectrum):
        # tokens: list of word forms; tags: parallel list of POS tags.
        self.tokens = tokens
        self.tags = tags
        # text_length: number of tokens (N); vocabulary_size: number of types (V).
        self.text_length = text_length
        self.vocabulary_size = vocabulary_size
        # frequency_list: Counter mapping word form -> frequency.
        self.frequency_list = frequency_list
        # frequency_spectrum: dict mapping frequency class m -> number of
        # types occurring exactly m times.
        self.frequency_spectrum = frequency_spectrum

    @classmethod
    def from_tokens(cls, tokens):
        """Create Text object from iterable of tokens, i.e. named tuples
        (word, pos).
        """
        toks = [t.word for t in tokens]
        tags = [t.pos for t in tokens]
        # Use the materialized list for the length so that arbitrary
        # iterables (e.g. generators) are supported, as the docstring
        # promises -- len() on a generator would raise TypeError.
        text_length = len(toks)
        frequency_list = collections.Counter(toks)
        vocabulary_size = len(frequency_list)
        frequency_spectrum = dict(collections.Counter(frequency_list.values()))
        return cls(toks, tags, text_length, vocabulary_size, frequency_list, frequency_spectrum)
11 changes: 4 additions & 7 deletions textcomplexity/utils/conllu.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,17 @@
from textcomplexity.utils import graph

UdToken = collections.namedtuple("UdToken", "id form lemma upos xpos feats head deprel deps misc".split())
Token = collections.namedtuple("Token", "word pos".split())
Token = collections.namedtuple("Token", "word pos upos".split())


def read_conllu_sentences(f, *, ignore_punct=False, punct_tags=None, warnings=True):
def read_conllu_sentences(f, *, warnings=True):
for sentence, sent_id in _read_conllu(f):
tokens = _get_tokens(sentence)
if ignore_punct:
tokens = [t for t in tokens if t.upos not in punct_tags]
forms = [t.form for t in tokens]
tokens = [Token(t.form, t.upos) for t in tokens]
tokens = [Token(t.form, t.xpos, t.upos) for t in tokens]
g = _create_nx_digraph(sentence, sent_id)
sensible, explanation = graph.is_sensible_graph(g)
if sensible:
yield forms, tokens, g
yield tokens, g
else:
if warnings:
logging.warn("Ignoring sentence with ID %s: %s" % (sent_id, explanation))
Expand Down
10 changes: 3 additions & 7 deletions textcomplexity/utils/custom_tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
Token = collections.namedtuple("Token", "word pos".split())


def read_tsv_sentences(f, *, ignore_punct=False, punct_tags=None, warnings=True):
def read_tsv_sentences(f, *, warnings=True):
"""Read a tab-separated file with six columns: word index, word,
part-of-speech tag, index of dependency head, dependency relation,
phrase structure tree. There must be an empty line after each
Expand All @@ -23,11 +23,7 @@ def attributes(t):
return {"word": t.word, "pos": t.pos}

for sent_id, sentence in enumerate(_get_sentences(f)):
tokens = [t for t in sentence]
if ignore_punct:
tokens = [t for t in tokens if t.pos not in punct_tags]
forms = [t.word for t in tokens]
tokens = [Token(t.word, t.pos) for t in tokens]
tokens = [Token(t.word, t.pos) for t in sentence]
if all((t.head != "_" for t in sentence)) and all((t.deprel != "_" for t in sentence)):
g = networkx.DiGraph(sentence_id=sent_id)
g.add_nodes_from([(i, attributes(t)) for i, t in enumerate(sentence)])
Expand Down Expand Up @@ -60,7 +56,7 @@ def attributes(t):
logging.warn("Failed to construct parse tree from sentence %s: %s" % (sent_id, tree_src))
tree = None
if sensible and tree is not None:
yield forms, tokens, g, tree
yield tokens, g, tree


def _get_sentences(f):
Expand Down

0 comments on commit da77021

Please sign in to comment.