Skip to content

Commit

Permalink
Add tags to Text object, move ignore_punct out of reader functions
Browse files Browse the repository at this point in the history
  • Loading branch information
tsproisl committed Dec 9, 2021
1 parent e1b06ca commit da77021
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 25 deletions.
16 changes: 9 additions & 7 deletions textcomplexity/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,18 +147,20 @@ def main():
punct_tags = set(punct_tags)
all_results = {}
for i, f in enumerate(args.TEXT):
tokens, tagged, graphs, ps_trees = None, None, None, None
tokens, sentences, graphs, ps_trees = None, None, None, None
if args.input_format == "conllu":
tokens, tagged, graphs = zip(*conllu.read_conllu_sentences(f, ignore_punct=args.ignore_punct, punct_tags=punct_tags))
tokens = list(itertools.chain.from_iterable(tokens))
sentences, graphs = zip(*conllu.read_conllu_sentences(f))
tokens = list(itertools.chain.from_iterable(sentences))
elif args.input_format == "tsv":
tokens, tagged, graphs, ps_trees = zip(*custom_tsv.read_tsv_sentences(f, ignore_punct=args.ignore_punct, punct_tags=punct_tags))
tokens = list(itertools.chain.from_iterable(tokens))
sentences, graphs, ps_trees = zip(*custom_tsv.read_tsv_sentences(f))
tokens = list(itertools.chain.from_iterable(sentences))
if args.ignore_punct and tokens is not None:
tokens = [t for t in tokens if t.pos not in punct_tags]
results = []
if args.sur and tokens is not None:
results.extend(surface_based(tokens, args.window_size, args.all_measures))
if args.sent and tagged is not None:
results.extend(sentence_based(tagged, punct_tags))
if args.sent and sentences is not None:
results.extend(sentence_based(sentences, punct_tags))
if args.dep and graphs is not None:
results.extend(dependency_based(graphs))
if args.const and ps_trees is not None:
Expand Down
14 changes: 10 additions & 4 deletions textcomplexity/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,24 @@

class Text:
    """Bundle a tokenized text with its POS tags and basic frequency
    statistics (frequency list and frequency spectrum).
    """

    def __init__(self, tokens, tags, text_length, vocabulary_size, frequency_list, frequency_spectrum):
        # tokens: list of word forms; tags: parallel list of POS tags.
        self.tokens = tokens
        self.tags = tags
        # text_length: number of tokens (N); vocabulary_size: number of types (V).
        self.text_length = text_length
        self.vocabulary_size = vocabulary_size
        # frequency_list: Counter mapping word form -> frequency.
        self.frequency_list = frequency_list
        # frequency_spectrum: dict mapping frequency class m -> number of
        # types occurring exactly m times.
        self.frequency_spectrum = frequency_spectrum

    @classmethod
    def from_tokens(cls, tokens):
        """Create Text object from iterable of tokens, i.e. named tuples
        (word, pos).
        """
        toks = [t.word for t in tokens]
        tags = [t.pos for t in tokens]
        # Use the materialized list for the length so that arbitrary
        # iterables (e.g. generators) are supported, as the docstring
        # promises -- len() on a generator would raise TypeError.
        text_length = len(toks)
        frequency_list = collections.Counter(toks)
        vocabulary_size = len(frequency_list)
        frequency_spectrum = dict(collections.Counter(frequency_list.values()))
        return cls(toks, tags, text_length, vocabulary_size, frequency_list, frequency_spectrum)
11 changes: 4 additions & 7 deletions textcomplexity/utils/conllu.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,17 @@
from textcomplexity.utils import graph

UdToken = collections.namedtuple("UdToken", "id form lemma upos xpos feats head deprel deps misc".split())
Token = collections.namedtuple("Token", "word pos".split())
Token = collections.namedtuple("Token", "word pos upos".split())


def read_conllu_sentences(f, *, ignore_punct=False, punct_tags=None, warnings=True):
def read_conllu_sentences(f, *, warnings=True):
for sentence, sent_id in _read_conllu(f):
tokens = _get_tokens(sentence)
if ignore_punct:
tokens = [t for t in tokens if t.upos not in punct_tags]
forms = [t.form for t in tokens]
tokens = [Token(t.form, t.upos) for t in tokens]
tokens = [Token(t.form, t.xpos, t.upos) for t in tokens]
g = _create_nx_digraph(sentence, sent_id)
sensible, explanation = graph.is_sensible_graph(g)
if sensible:
yield forms, tokens, g
yield tokens, g
else:
if warnings:
logging.warn("Ignoring sentence with ID %s: %s" % (sent_id, explanation))
Expand Down
10 changes: 3 additions & 7 deletions textcomplexity/utils/custom_tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
Token = collections.namedtuple("Token", "word pos".split())


def read_tsv_sentences(f, *, ignore_punct=False, punct_tags=None, warnings=True):
def read_tsv_sentences(f, *, warnings=True):
"""Read a tab-separated file with six columns: word index, word,
part-of-speech tag, index of dependency head, dependency relation,
phrase structure tree. There must be an empty line after each
Expand All @@ -23,11 +23,7 @@ def attributes(t):
return {"word": t.word, "pos": t.pos}

for sent_id, sentence in enumerate(_get_sentences(f)):
tokens = [t for t in sentence]
if ignore_punct:
tokens = [t for t in tokens if t.pos not in punct_tags]
forms = [t.word for t in tokens]
tokens = [Token(t.word, t.pos) for t in tokens]
tokens = [Token(t.word, t.pos) for t in sentence]
if all((t.head != "_" for t in sentence)) and all((t.deprel != "_" for t in sentence)):
g = networkx.DiGraph(sentence_id=sent_id)
g.add_nodes_from([(i, attributes(t)) for i, t in enumerate(sentence)])
Expand Down Expand Up @@ -60,7 +56,7 @@ def attributes(t):
logging.warn("Failed to construct parse tree from sentence %s: %s" % (sent_id, tree_src))
tree = None
if sensible and tree is not None:
yield forms, tokens, g, tree
yield tokens, g, tree


def _get_sentences(f):
Expand Down

0 comments on commit da77021

Please sign in to comment.