some more refactoring in TreeTaggerWrapper, added an option in the descriptor to supply a folder for the Chinese tokenizer files, did an initial implementation for Chinese word- and sentence-tokenization as well as POS tagging.
jzell committed Sep 8, 2013
1 parent 1589ef8 commit d3b11fc
Showing 3 changed files with 154 additions and 12 deletions.
9 changes: 9 additions & 0 deletions desc/annotator/TreeTaggerWrapper.xml
@@ -1,4 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>

<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>true</primitive>
@@ -39,6 +40,14 @@
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
<configurationParameter>
<name>ChineseTokenizerPath</name>
<description>Specifies the folder in which the Chinese tokenizer scripts and data files reside. If left empty, $TREETAGGER_HOME/cmd/ is assumed.
For the tokenizer and TreeTagger parameter file, see here: http://corpus.leeds.ac.uk/tools/zh/</description>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
</configurationParameters>
<configurationParameterSettings>
<nameValuePair>
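For context (not part of this commit): a minimal sketch of how the new optional parameter could be overridden when instantiating the engine from this descriptor via the core UIMA API. The descriptor path and the tokenizer folder below are assumptions.

import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.util.XMLInputSource;

public class ChineseTokenizerPathExample {
    public static void main(String[] args) throws Exception {
        // parse the descriptor changed in this commit (path is an assumption)
        XMLInputSource in = new XMLInputSource("desc/annotator/TreeTaggerWrapper.xml");
        AnalysisEngineDescription desc =
                UIMAFramework.getXMLParser().parseAnalysisEngineDescription(in);

        // override the new optional parameter; when left unset, the wrapper
        // falls back to $TREETAGGER_HOME/cmd (see the Java changes below)
        desc.getAnalysisEngineMetaData().getConfigurationParameterSettings()
                .setParameterValue("ChineseTokenizerPath", "/opt/treetagger/chinese-tokenizer");

        AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(desc);
        // ae.process(jcas) can now be called as usual
    }
}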
48 changes: 48 additions & 0 deletions src/de/unihd/dbs/uima/annotator/treetagger/TreeTaggerProperties.java
@@ -3,8 +3,11 @@
*/
package de.unihd.dbs.uima.annotator.treetagger;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;

/**
@@ -36,6 +39,9 @@ public class TreeTaggerProperties {
public String newLineSeparator = System.getProperty("line.separator");
public String fileSeparator = System.getProperty("file.separator");

// Chinese tokenizer path
public File chineseTokenizerPath = null;

public Process getTokenizationProcess(File inputFile) throws IOException {
// assemble a command line for the tokenization script and execute it
ArrayList<String> command = new ArrayList<String>();
@@ -57,6 +63,48 @@ public Process getTokenizationProcess(File inputFile) throws IOException {
return p;
}

public Process getChineseTokenizationProcess() throws IOException {
// read the Chinese segmenter script, patch it, and run it inline via perl -e
ArrayList<String> command = new ArrayList<String>();
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(new File(this.chineseTokenizerPath, "segment-zh.pl"))));
String segmenterScript = "";
String buf = null;
boolean firstLine = true;

// dirty hack: rewrite the script on the fly so that Perl autoflushes its output buffers
while((buf = br.readLine()) != null) {
// set the lexicon files
if(buf.startsWith("$lexicon="))
buf = "$lexicon=\"" + new File(this.chineseTokenizerPath, "lcmc-uni2.dat").getAbsolutePath() + "\";";
if(buf.startsWith("$lexicon2="))
buf = "$lexicon2=\"" + new File(this.chineseTokenizerPath, "lcmc-bigrams2.dat").getAbsolutePath() + "\";";

// add the autoflush variable
if(firstLine) {
segmenterScript += "$| = 1;";
firstLine = false;
}

// we omit comments
if(!buf.startsWith("#"))
segmenterScript += buf;
}
br.close();

command.add("perl");
command.add("-X");
command.add("-e");
command.add(segmenterScript);

String[] commandStr = new String[command.size()];
command.toArray(commandStr);

ProcessBuilder builder = new ProcessBuilder(commandStr);
builder.directory(this.chineseTokenizerPath);

return builder.start();
}

public Process getTreeTaggingProcess(File inputFile) throws IOException {
// assemble a command line based on configuration and execute the POS tagging.
ArrayList<String> command = new ArrayList<String>();
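A minimal usage sketch for getChineseTokenizationProcess(), not part of the commit: the tokenizer folder is a placeholder and must contain segment-zh.pl plus the two lexicon files. Because the inlined script sets $| = 1, each input line is answered immediately, so the process can be driven line by line.

package de.unihd.dbs.uima.annotator.treetagger;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

public class ChineseSegmenterDemo {
    public static void main(String[] args) throws Exception {
        TreeTaggerProperties ttprops = new TreeTaggerProperties();
        // placeholder folder with segment-zh.pl, lcmc-uni2.dat and lcmc-bigrams2.dat
        ttprops.chineseTokenizerPath = new File("/opt/treetagger/chinese-tokenizer");

        Process proc = ttprops.getChineseTokenizationProcess();
        BufferedWriter toSegmenter =
                new BufferedWriter(new OutputStreamWriter(proc.getOutputStream(), "UTF-8"));
        BufferedReader fromSegmenter =
                new BufferedReader(new InputStreamReader(proc.getInputStream(), "UTF-8"));

        // one line in, one segmented line out (thanks to the forced autoflush)
        toSegmenter.write("这是一个例子");
        toSegmenter.newLine();
        toSegmenter.flush();
        System.out.println(fromSegmenter.readLine()); // whitespace-separated tokens

        proc.destroy();
    }
}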
109 changes: 97 additions & 12 deletions src/de/unihd/dbs/uima/annotator/treetagger/TreeTaggerWrapper.java
@@ -12,6 +12,7 @@
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
@@ -45,6 +46,7 @@ public class TreeTaggerWrapper extends JCasAnnotator_ImplBase {
public static final String PARAM_ANNOTATE_SENTENCES = "annotate_sentences";
public static final String PARAM_ANNOTATE_PARTOFSPEECH = "annotate_partofspeech";
public static final String PARAM_IMPROVE_GERMAN_SENTENCES = "improvegermansentences";
public static final String PARAM_CHINESE_TOKENIZER_PATH = "ChineseTokenizerPath";

// language for this instance of the treetaggerwrapper
private Language language;
@@ -65,8 +67,16 @@ public class TreeTaggerWrapper extends JCasAnnotator_ImplBase {
*
*/
private class TreeTaggerContext extends RootUimaContext_impl {
// shorthand for when we don't want to supply a cnTokPath
@SuppressWarnings("unused")
public TreeTaggerContext(Language language, Boolean annotateTokens, Boolean annotateSentences,
Boolean annotatePartOfSpeech, Boolean improveGermanSentences) {
this(language, annotateTokens, annotateSentences, annotatePartOfSpeech,
improveGermanSentences, null);
}

public TreeTaggerContext(Language language, Boolean annotateTokens, Boolean annotateSentences,
Boolean annotatePartOfSpeech, Boolean improveGermanSentences, String cnTokPath) {
super();

// Initialize config
@@ -84,27 +94,38 @@ public TreeTaggerContext(Language language, Boolean annotateTokens, Boolean anno
configManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_PARTOFSPEECH), annotatePartOfSpeech);
configManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_SENTENCES), annotateSentences);
configManager.setConfigParameterValue(makeQualifiedName(PARAM_IMPROVE_GERMAN_SENTENCES), improveGermanSentences);
configManager.setConfigParameterValue(makeQualifiedName(PARAM_CHINESE_TOKENIZER_PATH), cnTokPath);
}
}

/**
* secondary initialize() to use wrapper outside of a uima pipeline
* @param language
* @param treeTaggerHome
* @param annotateTokens
* @param annotateSentences
* @param annotatePartOfSpeech
* @param improveGermanSentences
* shorthand for when we don't want to specify a cnTokPath
*/
public void initialize(Language language, String treeTaggerHome, Boolean annotateTokens,
Boolean annotateSentences, Boolean annotatePartOfSpeech, Boolean improveGermanSentences) {
this.initialize(language, treeTaggerHome, annotateTokens, annotateSentences, annotatePartOfSpeech,
improveGermanSentences, null);
}

/**
* secondary initialize() to use wrapper outside of a uima pipeline
*
* @param language Language/parameter file to use for the TreeTagger
* @param treeTaggerHome Path to the TreeTagger folder
* @param annotateTokens Whether to annotate tokens
* @param annotateSentences Whether to annotate sentences
* @param annotatePartOfSpeech Whether to annotate POS tags
* @param improveGermanSentences Whether to do improvements for german sentences
*/
public void initialize(Language language, String treeTaggerHome, Boolean annotateTokens,
Boolean annotateSentences, Boolean annotatePartOfSpeech, Boolean improveGermanSentences, String cnTokPath) {
this.setHome(treeTaggerHome);

TreeTaggerContext ttContext = new TreeTaggerContext(language, annotateTokens,
annotateSentences, annotatePartOfSpeech, improveGermanSentences);
annotateSentences, annotatePartOfSpeech, improveGermanSentences, cnTokPath);

this.initialize(ttContext);

}
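A sketch of standalone (non-pipeline) setup using the new seven-argument initialize(); the helper class is hypothetical and placed in the wrapper's package so that Language and TreeTaggerWrapper resolve without imports, and both paths are placeholders.

package de.unihd.dbs.uima.annotator.treetagger;

public class StandaloneChineseSetup {
    public static TreeTaggerWrapper createChineseWrapper() {
        TreeTaggerWrapper tt = new TreeTaggerWrapper();
        // the last argument is the new cnTokPath; passing null falls back to $TREETAGGER_HOME/cmd
        tt.initialize(Language.CHINESE, "/opt/treetagger",
                true,   // annotateTokens
                true,   // annotateSentences
                true,   // annotatePartOfSpeech
                false,  // improveGermanSentences
                "/opt/treetagger/chinese-tokenizer");
        return tt;
    }
}

The returned wrapper can then be fed JCas objects from the host application via process(jcas).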

/**
@@ -119,6 +140,7 @@ public void initialize(UimaContext aContext) {
annotate_sentences = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_SENTENCES);
annotate_partofspeech = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_PARTOFSPEECH);
improve_german_sentences = (Boolean) aContext.getConfigParameterValue(PARAM_IMPROVE_GERMAN_SENTENCES);
String cnTokPath = (String) aContext.getConfigParameterValue(PARAM_CHINESE_TOKENIZER_PATH);

// set some configuration based upon these values
ttprops.languageName = language.getTreeTaggerLangName();
@@ -128,6 +150,10 @@
ttprops.parFileName = ttprops.languageName + ".par";
ttprops.abbFileName = ttprops.languageName + "-abbreviations";
ttprops.languageSwitch = language.getTreeTaggerSwitch();
if(cnTokPath != null && !cnTokPath.equals(""))
ttprops.chineseTokenizerPath = new File(cnTokPath);
else
ttprops.chineseTokenizerPath = new File(ttprops.rootPath, "cmd");

// take utf-8 parameter files where available
if (language.equals(Language.GERMAN) && !ttprops.utf8Switch.equals("")) {
@@ -158,13 +184,19 @@
File parFile = new File(ttprops.rootPath+ttprops.fileSeparator+"lib",ttprops.parFileName);
File tokFile = new File(ttprops.rootPath+ttprops.fileSeparator+"cmd",ttprops.tokScriptName);
if (!(abbFileFlag = abbFile.exists())) {
Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.abbFileName);
if(language.equals(Language.CHINESE))
abbFileFlag = true;
else
Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.abbFileName);
}
if (!(parFileFlag = parFile.exists())) {
Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.parFileName);
}
if (!(tokScriptFlag = tokFile.exists())) {
Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.tokScriptName);
if(language.equals(Language.CHINESE))
tokScriptFlag = true;
else
Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.tokScriptName);
}

if (!abbFileFlag || !parFileFlag || !tokScriptFlag) {
Expand All @@ -188,7 +220,10 @@ public void initialize(UimaContext aContext) {
public void process(JCas jcas) throws AnalysisEngineProcessException {
// if the annotate_tokens flag is set, annotate the tokens and add them to the jcas
if(annotate_tokens)
tokenize(jcas);
if(language.equals(Language.CHINESE))
tokenizeChinese(jcas); // Chinese needs different tokenization
else
tokenize(jcas);

/* if the annotate_partofspeech flag is set, annotate partofspeech and,
* if specified, also tag sentences based upon the partofspeech tags.
@@ -270,9 +305,58 @@ private void tokenize(JCas jcas) {
}
}
}

}

/**
* tokenizes a given JCas object's document text using the Chinese tokenization
* script and adds the recognized tokens to the JCas object.
* @param jcas JCas object supplied by the pipeline
*/
private void tokenizeChinese(JCas jcas) {
try {
// read tokenized text to add tokens to the jcas
Process proc = ttprops.getChineseTokenizationProcess();
Logger.printDetail(component, "Chinese tokenization: " + ttprops.chineseTokenizerPath);

BufferedReader in = new BufferedReader(new InputStreamReader(proc.getInputStream(), "UTF-8"));
BufferedWriter out = null;

int tokenOffset = 0;
// loop through all the lines in the stdout output
String[] inSplits = jcas.getDocumentText().split("[\\r\\n]+");
for(String inSplit : inSplits) {
out = new BufferedWriter(new OutputStreamWriter(proc.getOutputStream(), "UTF-8"));
out.write(inSplit);
out.newLine();
out.flush();

String s;
while((s = in.readLine()) != null && in.ready()) {
String[] outSplits = s.split("\\s+");
for(String tok : outSplits) {
if(jcas.getDocumentText().indexOf(tok, tokenOffset) < 0)
throw new RuntimeException("Could not find token " + tok +
" in JCas after tokenizing with Chinese tokenization script.");

// create tokens and add them to the jcas's indexes.
Token newToken = new Token(jcas);
newToken.setBegin(jcas.getDocumentText().indexOf(tok, tokenOffset));
newToken.setEnd(newToken.getBegin() + tok.length());
newToken.addToIndexes();
tokenOffset = newToken.getEnd();
}
}
}

// clean up
in.close();
proc.destroy();
} catch (Exception e) {
e.printStackTrace();
}
}


/**
* based on tokens from the jcas object, adds part of speech (POS) and sentence
* tags to the jcas object using the treetagger program.
@@ -311,6 +395,7 @@ private void doTreeTag(JCas jcas) {
hsEndOfSentenceTag.add("_Z_Fst"); // ESTONIAN
hsEndOfSentenceTag.add("_Z_Int"); // ESTONIAN
hsEndOfSentenceTag.add("_Z_Exc"); // ESTONIAN
hsEndOfSentenceTag.add("ew"); // CHINESE

try {
Process p = ttprops.getTreeTaggingProcess(tmpDocument);
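To illustrate the alignment step in tokenizeChinese() above, a standalone demo (not from the commit; the sentence and its segmentation are assumptions): each token reported by the segmenter is searched for in the original document text starting at the end of the previous match, so repeated tokens are mapped to the correct occurrence instead of the first one.

public class OffsetAlignmentDemo {
    public static void main(String[] args) {
        String documentText = "他说他来了";              // original, unsegmented text
        String[] tokens = {"他", "说", "他", "来", "了"}; // assumed segmenter output

        int tokenOffset = 0;
        for (String tok : tokens) {
            int begin = documentText.indexOf(tok, tokenOffset);
            if (begin < 0) {
                throw new RuntimeException("Could not find token " + tok + " in document text.");
            }
            int end = begin + tok.length();
            System.out.println(tok + " -> [" + begin + ", " + end + ")");
            tokenOffset = end; // never search before the end of the previous token again
        }
    }
}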
