add new lemma map

cltk · May 4, 2015 · d2c7c14 · d2c7c14
1 parent 94a32c2
commit d2c7c14
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 8 deletions.
diff --git a/greek_lemmata_cltk.py b/greek_lemmata_cltk.py
diff --git a/transform_lemmata.py b/transform_lemmata.py
@@ -1,5 +1,7 @@
 from cltk.corpus.greek.beta_to_unicode import Replacer
-from collections import  defaultdict
+from collections import Counter
+from collections import defaultdict
+import operator
 
 replacer = Replacer()
 
@@ -11,12 +13,24 @@ def file_line_generator(file):
             yield file_line[:-1]  # remove '\n' from end of each line
 
 
+def iter_headwords(def_dict):
+    for inflection, headwords in def_dict.items():
+        for headword in list(headwords):
+            yield headword
+
+
+def make_headword_count(def_dict):
+    headwords = iter_headwords(def_dict)
+    return Counter(headwords)
+
+
+
 def parse_perseus_lemmata_file(file_generator, greek):
     """Parse lemmata file, looping through string for all data."""
     count = 0
     for line in file_generator:
         count += 1
-        if count % 1000 == 0:
+        if count % 10000 == 0:
             print('Parsing line {0}'.format(count))
         line_split = line.split('\t')
         headword = line_split[0]
@@ -55,15 +69,34 @@ def parse_perseus_lemmata_file(file_generator, greek):
 
 if __name__ == '__main__':
     lemma_headword_map = {}
-    greek_file_generator = file_line_generator('greek-lemmata.txt')
-    greek_lemma_headword = parse_perseus_lemmata_file(greek_file_generator, greek=False)
+    file_generator = file_line_generator('greek-lemmata.txt')
+    lemma_headword = parse_perseus_lemmata_file(file_generator, greek=True)
 
     print('Starting to build map …')
     lemmata_dd = defaultdict(set)
-    for k, v in greek_lemma_headword:
+    for k, v in lemma_headword:
         lemmata_dd[k].add(v)
 
-    print('Starting to write file …')
-    with open('greek_lemmata_cltk.txt', 'w') as file_opened:
-        file_opened.write(str(dict(lemmata_dd)))
+    print('Building headword frequencies …')
+    headword_frequencies = make_headword_count(lemmata_dd)
+
+    print('Building final lemma-headword dict …')
+    # for any lemma with more than one possible headword
+    # check each for which occurs most
+    final_lemmata = {}
+    for k, v in lemmata_dd.items():
+        if len(list(v)) > 1:
+            count_dict = {}
+            for curr_hw in list(v):
+                curr_count = headword_frequencies[curr_hw]
+                count_dict[curr_hw] = curr_count
+            # http://stackoverflow.com/a/268285
+            # if tie then takes one
+            top_headword = max(count_dict.items(), key=operator.itemgetter(1))[0]
+            final_lemmata[k] = top_headword
+        else:
+            final_lemmata[k] = list(v)[0]
 
+    print('Starting to write file …')
+    with open('greek_lemmata_cltk.py', 'w') as file_opened:
+        file_opened.write('LEMMATA = ' + str(dict(final_lemmata)))