Skip to content

Commit

Permalink
Added n-best as a benchmarking option
Browse files Browse the repository at this point in the history
  • Loading branch information
emmanuellegedin committed Jul 28, 2016
1 parent 2d268fb commit 2538457
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ public class Benchmark {

private final long count;

private final int nbest;

private Benchmark(Builder builder) {
this.tokenizer = builder.tokenizer;
this.inputFile = builder.inputFile;
Expand All @@ -72,6 +74,7 @@ private Benchmark(Builder builder) {
this.validationFile = builder.validationFile;
this.outputStatistics = builder.outputStatistics;
this.count = builder.count;
this.nbest = builder.nbest;
}

public void benchmark() throws IOException {
Expand Down Expand Up @@ -145,7 +148,9 @@ public void tokenize(BufferedReader reader, Writer writer, Writer statisticsWrit

private void tokenizeDocument(Writer writer, String text) throws IOException {
List<? extends TokenBase> tokens = tokenizer.tokenize(text);
List<List<TokenBase>> multiTokens = tokenizer.multiTokenize(text, 10, 100000);
if (nbest > 1) {
List<List<TokenBase>> multiTokens = tokenizer.multiTokenizeNBest(text, nbest);
}

updateStatistics(text, tokens);

Expand Down Expand Up @@ -254,6 +259,8 @@ public static class Builder {

private long count = 0;

private int nbest = 1;

public Builder tokenizer(TokenizerBase tokenizer) {
this.tokenizer = tokenizer;
return this;
Expand Down Expand Up @@ -294,6 +301,11 @@ public Builder count(long count) {
return this;
}

/**
 * Sets how many tokenizations to request per input text.
 *
 * The value is stored as-is, with no validation. The builder's default is 1;
 * tokenizeDocument only calls multiTokenizeNBest when the value is greater than 1.
 *
 * @param nbest number of tokenizations to get per input
 * @return this builder, so calls can be chained
 */
public Builder nbest(int nbest) {
this.nbest = nbest;
return this;
}

/**
 * Creates a Benchmark from this builder's current settings.
 *
 * The Benchmark constructor copies the builder's fields into the new instance.
 *
 * @return a new Benchmark configured from this builder
 */
public Benchmark build() {
return new Benchmark(this);
}
Expand All @@ -307,6 +319,7 @@ public static void main(String[] args) throws IOException {
options.addOption("c", "count", true, "Number of documents ot process (Default: 0, which means all");
// options.addOption("v", "validation-input", true, "Validation filename");
options.addOption("o", "output", true, "Output filename. If unset, segmentation is done, but the result is discarded");
options.addOption("n", "n-best", true, "The number of tokenizations to get per input");
options.addOption(null, "benchmark-output", true, "Benchmark metrics output filename filename");

CommandLineParser parser = new DefaultParser();
Expand Down Expand Up @@ -376,13 +389,18 @@ public static void main(String[] args) throws IOException {
commandLine.getOptionValue("c", "0")
);

int nbest = Integer.parseInt(
commandLine.getOptionValue("n", "1")
);

Benchmark benchmark = new Builder()
.tokenizer(tokenizer)
.inputFile(new File(inputFilename))
.outputFile(outputFile)
.outputStatisticsFile(statisticsFile)
.setOutputStatistiscs(true)
.count(count)
.nbest(nbest)
.build();

benchmark.benchmark();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ public void benchmark(String tokenizerClass, String tokenizerName, String userDi
args.add(tokenizerClass);
args.add("-c");
args.add("3000");
args.add("-n");
args.add("10");
args.add("--benchmark-output");
args.add("jawiki-" + tokenizerName + "-benchmark.tsv");
args.add("../kuromoji-benchmark/jawiki/jawiki.tsv.gz");
Expand All @@ -92,6 +94,7 @@ public void tokenize(String inputFilename,
Benchmark.main(new String[]{
"-t", tokenizerClass,
"-o", outputFilename,
"-n", "10",
inputFilename
});
}
Expand All @@ -103,6 +106,7 @@ public void tokenizeUserDictionary(String inputFilename,
Benchmark.main(new String[]{
"-t", tokenizerClass,
"-o", outputFilename,
"-n", "10",
"-u", userDictionaryFilename,
inputFilename
});
Expand Down

0 comments on commit 2538457

Please sign in to comment.