Release 1.4.5; Document profiling/annotating

GrammarSoft · Jun 1, 2023 · 4c92ab6 · 4c92ab6
1 parent 24840f1
commit 4c92ab6
Show file tree

Hide file tree

Showing 6 changed files with 111 additions and 15 deletions.
diff --git a/manual/binarygrammar.xml b/manual/binarygrammar.xml
@@ -68,10 +68,10 @@
     </section>
 
     <section id="binarystatistics">
-      <title>--grammar-info, --grammar-out, --statistics</title>
+      <title>--grammar-info, --grammar-out, --profile</title>
       <para>
         Since binary grammars cannot be written back out in textual form, the command line options
-        --grammar-info, --grammar-out, and --statistics will not work in binary mode.
+        --grammar-info, --grammar-out, and --profile will not work in binary mode.
       </para>
     </section>
   </section>

diff --git a/manual/cmdreference.xml b/manual/cmdreference.xml
@@ -34,6 +34,8 @@ Options:
  -s, --sections             number or ranges of sections to run; defaults to all sections
      --rules                number or ranges of rules to run; defaults to all rules
      --rule                 a name or number of a single rule to run
+     --nrules               a regex for which rule names to parse/run; defaults to all rules
+     --nrules-v             a regex for which rule names not to parse/run
  -d, --debug                enables debug output (very noisy)
  -v, --verbose              increases verbosity
      --quiet                squelches warnings (same as -v 0)
@@ -54,9 +56,7 @@ Options:
      --dry-run              make no actual changes to the input
      --single-run           runs each section only once; same as --max-runs 1
      --max-runs             runs each section max N times; defaults to unlimited (0)
- -S, --statistics           gathers profiling statistics while applying grammar
- -Z, --optimize-unsafe      destructively optimize the profiled grammar to be faster
- -z, --optimize-safe        conservatively optimize the profiled grammar to be faster
+     --profile              gathers profiling statistics and code coverage into a SQLite database
  -p, --prefix               sets the mapping prefix; defaults to @
      --unicode-tags         outputs Unicode code points for things like -&gt;
      --unique-tags          outputs unique tags only once per reading
@@ -112,6 +112,7 @@ Options:
      --add-tags      adds minimal analysis to readings (implies -x)
  -C, --out-cg        sets output format to CG (default)
  -A, --out-apertium  sets output format to Apertium
+ -F, --out-fst       sets output format to HFST/XFST
  -M, --out-matxin    sets output format to Matxin
  -N, --out-niceline  sets output format to Niceline CG
  -X, --out-plain     sets output format to plain text
@@ -121,6 +122,8 @@ Options:
  -r, --rtl           sets sub-reading direction to RTL (default)
  -l, --ltr           sets sub-reading direction to LTR
  -o, --ordered       tag order matters mode
+ -D, --parse-dep     parse dependency (defaults to treating as normal tags)
+     --unicode-tags  outputs Unicode code points for things like -&gt;
      --deleted       read deleted readings as such, instead of as text
  -B, --no-break      inhibits any extra whitespace in output
     </screen>
@@ -147,17 +150,24 @@ USAGE: cg-comp grammar_file output_file
       Made for the Apertium toolchain.
     </para>
     <screen>
-USAGE: cg-proc [-t] [-s] [-d] grammar_file [input_file [output_file]]
+USAGE: cg-proc [-t] [-s] [-d] [-g] [-r rule] grammar_file [input_file [output_file]]
 
 Options:
- -d, --disambiguation:    morphological disambiguation
- -s, --sections=NUM:      specify number of sections to process
- -f, --stream-format=NUM: set the format of the I/O stream to NUM,
-                            where `0' is VISL format and `1' is
-                            Apertium format (default: 1)
- -t, --trace:             print debug output on stderr
- -v, --version:           version
- -h, --help:              show this help
+        -d:      morphological disambiguation (default behaviour)
+        -s:      specify number of sections to process
+        -f:      set the format of the I/O stream to NUM,
+                   where `0' is VISL format, `1' is
+                   Apertium format and `2' is Matxin (default: 1)
+        -r:      run only the named rule
+        -t:      print debug output on stderr
+        -w:      enforce surface case on lemma/baseform
+                   (to work with -w option of lt-proc)
+        -n:      do not print out the word form of each cohort
+        -g:      do not surround lexical units in ^$
+        -1:      only output the first analysis if ambiguity remains
+        -z:      flush output on the null character
+        -v:      version
+        -h:      show this help
     </screen>
   </section>
 

diff --git a/manual/manual.xml b/manual/manual.xml
@@ -53,6 +53,7 @@
   <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="sets.xml" />
   <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="tags.xml" />
   <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="subreadings.xml" />
+  <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="profiling.xml" />
   <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="binarygrammar.xml" />
   <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="externals.xml" />
   <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="streamcmds.xml" />

diff --git a/manual/profiling.xml b/manual/profiling.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE chapter SYSTEM "docbook-dtd-45/docbookx.dtd">
+
+<chapter id="profiling">
+  <title>Profiling / Code Coverage</title>
+
+  <section id="prof-what-why">
+    <title>What and why</title>
+    <para>
+      Grammars tend to accumulate rules and conditions over time, as exceptions and corner cases are discovered. But these are very rarely removed again, since they may still be useful but nobody knows if they really are. These tools aim to solve that problem, by letting you test a grammar against a large corpus and see exactly what rules and contexts are used, how often they are used (or not), and examples of contexts in which they are used.
+    </para>
+  </section>
+
+  <section id="prof-gather">
+    <title>Gathering profiling data</title>
+    <para>
+      When running a corpus through a grammar, the extra cmdline flag <code>--profile data.sqlite</code> will gather code coverage and data for hits and misses for every rule and condition into an SQLite 3 database. Each run must use its own database, but they can subsequently be merged with <code>cg-merge-annotations output.sqlite input-one.sqlite input-two.sqlite input-three.sqlite ...</code>.
+    </para>
+  </section>
+
+  <section id="prof-annotate">
+    <title>Annotating</title>
+    <para>
+      Use <code>cg-annotate data.sqlite /path/to/output</code> to render the gathered data as HTML. This will create a <code>/path/to/output/index.html</code> file that you can open in a browser, alongside files with hit examples for each rule and context.
+    </para>
+    <para>
+      In case of included grammars, each grammar is rendered separately. And in each rendering, the rules and conditions that matched are clickable to go to a page where an example context is shown. The example has <code># RULE TARGET BEGIN</code> and <code># RULE TARGET END</code> to mark exactly what cohort triggered the rule/condition.
+    </para>
+  </section>
+</chapter>
diff --git a/newsletters/2023-06-01.txt b/newsletters/2023-06-01.txt
@@ -0,0 +1,55 @@
+A new release of CG-3 has been tagged v1.4.5 (binary rev 13897).
+
+Haven't done one of these rundowns since the last NoDaLiDa workshop in 2019, and didn't quite make it in time for last week's NoDaLiDa.
+
+Authoritative repository is now on GitHub: https://github.com/GrammarSoft/cg3
+
+Notable new features:
+- Nested rules keyword With implemented by Daniel Glen Swanson. See https://visl.sdu.dk/cg3/chunked/rules.html#with
+- Implemented code coverage / profiling to find annotated examples. See https://visl.sdu.dk/cg3/chunked/profiling.html
+
+New features:
+- Added rule flags NoMapped and NoParent which will cause the rule to skip mapped readings or cohorts with a dependency parent.
+- Cmdline flag --dep-absolute will cause dependency to be written with globally unique cohort IDs
+- Added rule flag Ignored that will make Remove hide away readings for the current grammar. See https://visl.sdu.dk/cg3/chunked/rules.html#rule-options-ignored
+- RemCohort can take Ignored to hide away whole cohorts. And Ignored WithChild to hide away whole dependency sub-trees.
+- Added rule flag LookIgnored and context modifier 'I' to allow rules and contexts to look at ignored readings.
+- Added rule type Restore to revive previously deleted/ignored readings. See https://visl.sdu.dk/cg3/chunked/rules.html#restore
+- Section headers can now have rule flags, which will then apply to all rules in that section.
+- Cmdline flag -B will inhibit and trim whitespace between/after cohorts.
+- Cmdline flag -T will delimit based on a regex of non-CG data. Defaults to /(^|\n)<s/. See also https://visl.sdu.dk/cg3/chunked/cgkeywords.html#keyword-text-delimiters
+- Environment variables CG3_DEFAULT and CG3_OVERRIDE can set and override CG-3 cmdline parameters. Ditto CG3_CONV_DEFAULT and CG3_CONV_OVERRIDE for cg-conv.
+- Added context modifier 't' to look at non-target readings, and 'T' to only look at target readings. See https://visl.sdu.dk/cg3/chunked/contexts.html#test-active
+- Added global option addcohort-attach to make all AddCohort rules automatically attach to the nearest neighbour. See https://visl.sdu.dk/cg3/chunked/grammar.html#grammar-options
+- cg-sort can now sort by weight (-w), reverse (-r), and keep only the first reading (-1).
+- cg-conv can now convert back to FST format with -F.
+- List += can append tags to an existing set.
+- New directive Undef-Sets to delete sets and allow their redefinition. Mostly used when including a common grammar that you want to make a few exceptions to.
+- Implemented window-local stream variables. See https://visl.sdu.dk/cg3/chunked/tags.html#local-variables
+- Cmdline flags --nrules and --nrules-v to filter which named rules to include in the parse.
+- Tag type line match to match the literal whole reading line. See https://visl.sdu.dk/cg3/chunked/tags.html#line-match
+
+Changes:
+- Jump targets can now be constructed from unification and varstrings.
+- Relation queries can now be constructed from varstrings.
+- Relations now exist as tags during the run so they can be captured with regex.
+- Relation queries themselves can also be captured with regex.
+- Binary grammars should now be reproducible.
+- Baseforms may now be empty strings.
+- SetVariable/RemVariable now allow varstrings for variable names and values.
+- Stream variables can now have their values tested by equality and regex. See https://visl.sdu.dk/cg3/chunked/tags.html#global-variables
+- Lots of updates and new features to the Emacs mode by Kevin Brubeck Unhammer.
+- On Posix platforms, Include paths are now shell-expanded so tilde and environment variables can be used.
+- Codebase now requires C++17
+
+Fixed Bugs:
+- Shorthand @< and @> will now fail if there is no previous/next window to look at.
+
+Main site is https://visl.sdu.dk/cg3.html
+Google Group is https://groups.google.com/group/constraint-grammar
+Source is at https://github.com/GrammarSoft/cg3
+OS X binaries are at https://apertium.projectjj.com/osx/
+RHEL/Fedora/CentOS/OpenSUSE packages are at https://apertium.projectjj.com/rpm/howto.txt
+Debian/Ubuntu packages are at https://apertium.projectjj.com/apt/howto.txt
+
+-- Tino Didriksen
diff --git a/src/version.hpp b/src/version.hpp
@@ -27,7 +27,7 @@ constexpr auto CG3_COPYRIGHT_STRING = "Copyright (C) 2007-2023 GrammarSoft ApS.
 
 constexpr uint32_t CG3_VERSION_MAJOR = 1;
 constexpr uint32_t CG3_VERSION_MINOR = 4;
-constexpr uint32_t CG3_VERSION_PATCH = 4;
+constexpr uint32_t CG3_VERSION_PATCH = 5;
 constexpr uint32_t CG3_REVISION = 13897;
 constexpr uint32_t CG3_FEATURE_REV = 13897;
 constexpr uint32_t CG3_TOO_OLD = 10373;