-
Notifications
You must be signed in to change notification settings - Fork 67
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge with 8358b3eb6e5b5237ade42438836b869e6141a776
- Loading branch information
Showing
75 changed files
with
3,072 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<!-- UIMA analysis engine descriptor for the StanfordPOSTaggerWrapper annotator (primitive engine, Java framework implementation). -->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier"> | ||
<frameworkImplementation>org.apache.uima.java</frameworkImplementation> | ||
<primitive>true</primitive> | ||
<annotatorImplementationName>de.unihd.dbs.uima.annotator.stanfordtagger.StanfordPOSTaggerWrapper</annotatorImplementationName> | ||
<analysisEngineMetaData> | ||
<name>StanfordPOSTaggerWrapper</name> | ||
<description/> | ||
<version>1.0</version> | ||
<vendor>UniHD DBS</vendor> | ||
<!-- Declared parameters: model_path is the only mandatory one; config_path and the three annotate_* switches are optional. -->
<configurationParameters searchStrategy="language_fallback"> | ||
<configurationParameter> | ||
<name>model_path</name> | ||
<description>Path to a model for use with the Stanford POS Tagger</description> | ||
<type>String</type> | ||
<multiValued>false</multiValued> | ||
<mandatory>true</mandatory> | ||
</configurationParameter> | ||
<configurationParameter> | ||
<name>config_path</name> | ||
<description>Optional path to a configuration for use with the Stanford POS Tagger</description> | ||
<type>String</type> | ||
<multiValued>false</multiValued> | ||
<mandatory>false</mandatory> | ||
</configurationParameter> | ||
<configurationParameter> | ||
<name>annotate_tokens</name> | ||
<description>Whether or not to annotate tokens</description> | ||
<type>Boolean</type> | ||
<multiValued>false</multiValued> | ||
<mandatory>false</mandatory> | ||
</configurationParameter> | ||
<configurationParameter> | ||
<name>annotate_sentences</name> | ||
<description>Whether or not to annotate sentences.</description> | ||
<type>Boolean</type> | ||
<multiValued>false</multiValued> | ||
<mandatory>false</mandatory> | ||
</configurationParameter> | ||
<configurationParameter> | ||
<name>annotate_partofspeech</name> | ||
<description>Whether or not to annotate part of speech (POS) information</description> | ||
<type>Boolean</type> | ||
<multiValued>false</multiValued> | ||
<mandatory>false</mandatory> | ||
</configurationParameter> | ||
</configurationParameters> | ||
<!-- Settings shipped with this descriptor: all three annotate_* switches are set to true; no value is given for config_path. -->
<configurationParameterSettings> | ||
<nameValuePair> | ||
<name>annotate_tokens</name> | ||
<value> | ||
<boolean>true</boolean> | ||
</value> | ||
</nameValuePair> | ||
<nameValuePair> | ||
<name>annotate_sentences</name> | ||
<value> | ||
<boolean>true</boolean> | ||
</value> | ||
</nameValuePair> | ||
<nameValuePair> | ||
<name>annotate_partofspeech</name> | ||
<value> | ||
<boolean>true</boolean> | ||
</value> | ||
</nameValuePair> | ||
<!-- NOTE(review): model_path below is an absolute path; presumably it has to be adjusted to the local Stanford POS Tagger installation. TODO confirm. -->
<nameValuePair> | ||
<name>model_path</name> | ||
<value> | ||
<string>/opt/stanford-postagger-full/models/english-bidirectional-distsim.tagger</string> | ||
</value> | ||
</nameValuePair> | ||
</configurationParameterSettings> | ||
<!-- The type system is pulled in through a relative import location. -->
<typeSystemDescription> | ||
<imports> | ||
<import location="../type/HeidelTime_TypeSystem.xml"/> | ||
</imports> | ||
</typeSystemDescription> | ||
<typePriorities/> | ||
<fsIndexCollection/> | ||
<!-- No inputs, outputs, or supported languages are declared in the capability below. -->
<capabilities> | ||
<capability> | ||
<inputs/> | ||
<outputs/> | ||
<languagesSupported/> | ||
</capability> | ||
</capabilities> | ||
<operationalProperties> | ||
<modifiesCas>true</modifiesCas> | ||
<multipleDeploymentAllowed>true</multipleDeploymentAllowed> | ||
<outputsNewCASes>false</outputsNewCASes> | ||
</operationalProperties> | ||
</analysisEngineMetaData> | ||
<resourceManagerConfiguration/> | ||
</analysisEngineDescription> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
20 changes: 20 additions & 0 deletions
20
resources/arabic/normalization/resources_normalization_normApprox4Dates.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
// author: Jannik Strötgen | ||
// email: [email protected] | ||
// date: 2011-09-12 | ||
// This file contains "approximate words" and their normalized expressions | ||
// for dates/times according to TIMEX3 format. | ||
// For example, the normalized value of "about" is "APPROX" | ||
// FORMAT: "approximate-word","normalized-approximate-word" | ||
// about | ||
"حوالي","APPROX" | ||
"نحو","APPROX" | ||
"تقريبا[ً]?","APPROX" | ||
"حول","APPROX" | ||
"حتى","EQUAL_OR_LESS" | ||
"على ال[اأ]قل","EQUAL_OR_MORE" | ||
"[اأ]كثر من","MORE_THAN" | ||
"[اأ]قل من","LESS_THAN" | ||
"[اأ]طول من","MORE_THAN" | ||
"[اأ]قصر من","LESS_THAN" | ||
"في غضون","APPROX" | ||
"قرابة","APPROX" |
20 changes: 20 additions & 0 deletions
20
resources/arabic/normalization/resources_normalization_normApprox4Durations.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
// author: Jannik Strötgen | ||
// email: [email protected] | ||
// date: 2011-09-12 | ||
// This file contains "approximate words" and their normalized expressions | ||
// for durations according to TIMEX3 format. | ||
// For example, the normalized value of "about" is "APPROX" | ||
// FORMAT: "approximate-word","normalized-approximate-word" | ||
"حوالي","APPROX" | ||
"نحو","APPROX" | ||
"تقريبا[ً]?","APPROX" | ||
"حول","APPROX" | ||
"حتى","EQUAL_OR_LESS" | ||
"على ال[اأ]قل","EQUAL_OR_MORE" | ||
"[اأ]كثر من","MORE_THAN" | ||
"[اأ]قل من","LESS_THAN" | ||
"[اأ]طول من","MORE_THAN" | ||
"[اأ]قصر من","LESS_THAN" | ||
"في غضون","APPROX" | ||
"قرابة","APPROX" | ||
|
17 changes: 17 additions & 0 deletions
17
resources/arabic/normalization/resources_normalization_normArabicDigit.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
// author: Jannik Strötgen | ||
// email: [email protected] | ||
// date: 2011-06-10 | ||
// This file contains Arabic-Indic digits and their normalized (ASCII digit) expressions | ||
// according to TIMEX3 format. | ||
// For example, the normalized value of "\u0661" is "1" | ||
// FORMAT: "arabic-digit","normalized-digit" | ||
"\u0660","0" | ||
"\u0661","1" | ||
"\u0662","2" | ||
"\u0663","3" | ||
"\u0664","4" | ||
"\u0665","5" | ||
"\u0666","6" | ||
"\u0667","7" | ||
"\u0668","8" | ||
"\u0669","9" |
123 changes: 123 additions & 0 deletions
123
resources/arabic/normalization/resources_normalization_normDateNumber.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
// author: Jannik Strötgen | ||
// email: [email protected] | ||
// date: 2011-06-10 | ||
// This file contains "date numbers" (number words) and their normalized expressions | ||
// according to TIMEX3 format. | ||
// For example, the normalized value of "one" is "01" | ||
// FORMAT: "date-number","normalized-date-number" | ||
"واحد[هة]?","01" | ||
"[اإ]ثن[ت]?[اي]ن","02" | ||
"ثلاث[هة]?","03" | ||
"[اأ]ربع[هة]?","04" | ||
"خمس[هة]?","05" | ||
"ست[هة]?","06" | ||
"سبع[هة]?","07" | ||
"ثماني[هة]?","08" | ||
"تسع[هة]?","09" | ||
"عشر[هة]?","10" | ||
"[أاإ]حد[ى]?[\s]?عشر[هة]?","11" | ||
"[اإ]ثن[ت]?[اي][\s]?عشر[هة]?","12" | ||
"ثلاث[هة]?[\s]?عشر[هة]?","13" | ||
"[أا]ربع[هة]?[\s]?عشر[هة]?","14" | ||
"خمس[هة]?[\s]?عشر[هة]?","15" | ||
"ست[هة]?[\s]?عشر[هة]?","16" | ||
"سبع[هة]?[\s]?عشر[هة]?","17" | ||
"ثماني[هة]?[\s]?عشر[هة]?","18" | ||
"تسع[هة]?[\s]?عشر[هة]?","19" | ||
"عشر[وي]ن","20" | ||
"[اإ]حدى[\s]?و[\s]?عشر[وي]ن","21" | ||
"واحد[\s]?و[\s]?عشر[وي]ن","21" | ||
"[اإ]ثن[ت]?[اي][ن]?[\s]?و[\s]?عشر[وي]ن","22" | ||
"ثلاث[هة]?[\s]?و[\s]?عشر[وي]ن","23" | ||
"[أا]ربع[هة]?[\s]?و[\s]?عشر[وي]ن","24" | ||
"خمس[هة]?[\s]?و[\s]?عشر[وي]ن","25" | ||
"ست[هة]?[\s]?و[\s]?عشر[وي]ن","26" | ||
"سبع[هة]?[\s]?و[\s]?عشر[وي]ن","27" | ||
"ثماني[هة]?[\s]?و[\s]?عشر[وي]ن","28" | ||
"تسع[هة]?[\s]?و[\s]?عشر[وي]ن","29" | ||
"ثلاث[وي]ن","30" | ||
"[اإ]حدى[\s]?و[\s]?ثلاث[وي]ن","31" | ||
"واحد[\s]?و[\s]?ثلاث[وي]ن","31" | ||
"[اإ]ثن[ت]?[اي][ن]?[\s]?و[\s]?ثلاث[وي]ن","32" | ||
"ثلاث[هة]?[\s]?و[\s]?ثلاث[وي]ن","33" | ||
"[أا]ربع[هة]?[\s]?و[\s]?ثلاث[وي]ن","34" | ||
"خمس[هة]?[\s]?و[\s]?ثلاث[وي]ن","35" | ||
"ست[هة]?[\s]?و[\s]?ثلاث[وي]ن","36" | ||
"سبع[هة]?[\s]?و[\s]?ثلاث[وي]ن","37" | ||
"ثماني[هة]?[\s]?و[\s]?ثلاث[وي]ن","38" | ||
"تسع[هة]?[\s]?و[\s]?ثلاث[وي]ن","39" | ||
"[اأ]ربع[وي]ن","40" | ||
"[اإ]حدى[\s]?و[\s]?[اأ]ربع[وي]ن","41" | ||
"واحد[\s]?و[\s]?[اأ]ربع[وي]ن","41" | ||
"[اإ]ثن[ت]?[اي][ن]?[\s]?و[\s]?[اأ]ربع[وي]ن","42" | ||
"ثلاث[هة]?[\s]?و[\s]?[اأ]ربع[وي]ن","43" | ||
"[أا]ربع[هة]?[\s]?و[\s]?[اأ]ربع[وي]ن","44" | ||
"خمس[هة]?[\s]?و[\s]?[اأ]ربع[وي]ن","45" | ||
"ست[هة]?[\s]?و[\s]?[اأ]ربع[وي]ن","46" | ||
"سبع[هة]?[\s]?و[\s]?[اأ]ربع[وي]ن","47" | ||
"ثماني[هة]?[\s]?و[\s]?[اأ]ربع[وي]ن","48" | ||
"تسع[هة]?[\s]?و[\s]?[اأ]ربع[وي]ن","49" | ||
"خمس[وي]ن","50" | ||
"[اإ]حدى[\s]?و[\s]?خمس[وي]ن","51" | ||
"واحد[\s]?و[\s]?خمس[وي]ن","51" | ||
"[اإ]ثن[ت]?[اي][ن]?[\s]?و[\s]?خمس[وي]ن","52" | ||
"ثلاث[هة]?[\s]?و[\s]?خمس[وي]ن","53" | ||
"[أا]ربع[هة]?[\s]?و[\s]?خمس[وي]ن","54" | ||
"خمس[هة]?[\s]?و[\s]?خمس[وي]ن","55" | ||
"ست[هة]?[\s]?و[\s]?خمس[وي]ن","56" | ||
"سبع[هة]?[\s]?و[\s]?خمس[وي]ن","57" | ||
"ثماني[هة]?[\s]?و[\s]?خمس[وي]ن","58" | ||
"تسع[هة]?[\s]?و[\s]?خمس[وي]ن","59" | ||
"ست[وي]ن","60" | ||
"[اإ]حدى[\s]?و[\s]?ست[وي]ن","61" | ||
"واحد[\s]?و[\s]?ست[وي]ن","61" | ||
"[اإ]ثن[ت]?[اي][ن]?[\s]?و[\s]?ست[وي]ن","62" | ||
"ثلاث[هة]?[\s]?و[\s]?ست[وي]ن","63" | ||
"[أا]ربع[هة]?[\s]?و[\s]?ست[وي]ن","64" | ||
"خمس[هة]?[\s]?و[\s]?ست[وي]ن","65" | ||
"ست[هة]?[\s]?و[\s]?ست[وي]ن","66" | ||
"سبع[هة]?[\s]?و[\s]?ست[وي]ن","67" | ||
"ثماني[هة]?[\s]?و[\s]?ست[وي]ن","68" | ||
"تسع[هة]?[\s]?و[\s]?ست[وي]ن","69" | ||
"سبع[وي]ن","70" | ||
"[اإ]حدى[\s]?و[\s]?سبع[وي]ن","71" | ||
"واحد[\s]?و[\s]?سبع[وي]ن","71" | ||
"[اإ]ثن[ت]?[اي][ن]?[\s]?و[\s]?سبع[وي]ن","72" | ||
"ثلاث[هة]?[\s]?و[\s]?سبع[وي]ن","73" | ||
"[أا]ربع[هة]?[\s]?و[\s]?سبع[وي]ن","74" | ||
"خمس[هة]?[\s]?و[\s]?سبع[وي]ن","75" | ||
"ست[هة]?[\s]?و[\s]?سبع[وي]ن","76" | ||
"سبع[هة]?[\s]?و[\s]?سبع[وي]ن","77" | ||
"ثماني[هة]?[\s]?و[\s]?سبع[وي]ن","78" | ||
"تسع[هة]?[\s]?و[\s]?سبع[وي]ن","79" | ||
"ثمان[وي]ن","80" | ||
"[اإ]حدى[\s]?و[\s]?ثمان[وي]ن","81" | ||
"واحد[\s]?و[\s]?ثمان[وي]ن","81" | ||
"[اإ]ثن[ت]?[اي][ن]?[\s]?و[\s]?ثمان[وي]ن","82" | ||
"ثلاث[هة]?[\s]?و[\s]?ثمان[وي]ن","83" | ||
"[أا]ربع[هة]?[\s]?و[\s]?ثمان[وي]ن","84" | ||
"خمس[هة]?[\s]?و[\s]?ثمان[وي]ن","85" | ||
"ست[هة]?[\s]?و[\s]?ثمان[وي]ن","86" | ||
"سبع[هة]?[\s]?و[\s]?ثمان[وي]ن","87" | ||
"ثماني[هة]?[\s]?و[\s]?ثمان[وي]ن","88" | ||
"تسع[هة]?[\s]?و[\s]?ثمان[وي]ن","89" | ||
"تسع[وي]ن","90" | ||
"[اإ]حدى[\s]?و[\s]?تسع[وي]ن","91" | ||
"واحد[\s]?و[\s]?تسع[وي]ن","91" | ||
"[اإ]ثن[ت]?[اي][ن]?[\s]?و[\s]?تسع[وي]ن","92" | ||
"ثلاث[هة]?[\s]?و[\s]?تسع[وي]ن","93" | ||
"[أا]ربع[هة]?[\s]?و[\s]?تسع[وي]ن","94" | ||
"خمس[هة]?[\s]?و[\s]?تسع[وي]ن","95" | ||
"ست[هة]?[\s]?و[\s]?تسع[وي]ن","96" | ||
"سبع[هة]?[\s]?و[\s]?تسع[وي]ن","97" | ||
"ثماني[هة]?[\s]?و[\s]?تسع[وي]ن","98" | ||
"تسع[هة]?[\s]?و[\s]?تسع[وي]ن","99" | ||
"م[ا]?ئ[هة]","1" | ||
"م[ا]?ئت[اي][ن]?","2" | ||
"ثلاث[هة]?[\s]?م[ا]?ئ[هة]","3" | ||
"[اأ]ربع[هة]?[\s]?م[ا]?ئ[هة]","4" | ||
"خمس[هة]?[\s]?م[ا]?ئ[هة]","5" | ||
"ست[هة]?[\s]?م[ا]?ئ[هة]","6" | ||
"سبع[هة]?[\s]?م[ا]?ئ[هة]","7" | ||
"ثماني[هة]?[\s]?م[ا]?ئ[هة]","8" | ||
"تسع[هة]?[\s]?م[ا]?ئ[هة]","9" |
32 changes: 32 additions & 0 deletions
32
resources/arabic/normalization/resources_normalization_normDateWord.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
// author: Jannik Strötgen | ||
// email: [email protected] | ||
// date: 2011-06-10 | ||
// This file contains "date words" and their normalized expressions | ||
// according to TIMEX3 format. | ||
// For example, the normalized value of "tomorrow" is "UNDEF-next-day" | ||
// FORMAT: "date-word","normalized-date-word" | ||
"[اأ]ول [اأ]مس","UNDEF-this-day-MINUS-2" | ||
"[اأ]لبارح[هة]","UNDEF-last-day" | ||
"(ب)?[اأ](لأ)?(لا)?مس","UNDEF-last-day" | ||
"[اأ]ليوم","UNDEF-this-day" | ||
"[اأ][ل]?غد","UNDEF-next-day" | ||
"غدا","UNDEF-next-day" | ||
"[اأ]ل[اآ]ن","PRESENT_REF" | ||
"في اللحظ[هة]","PRESENT_REF" | ||
"في [اأ]قرب وقت","FUTURE_REF" | ||
"قريبا[ً]?","FUTURE_REF" | ||
"حديثا[ً]?","PRESENT_REF" | ||
"حاليا[ً]?","PRESENT_REF" | ||
"سابقا[ً]?","PAST_REF" | ||
"في الماضي","PAST_REF" | ||
"في السابق","PAST_REF" | ||
"مؤخرا[ً]?","PAST_REF" | ||
"آنذاك","PAST_REF" | ||
"في الراهن","PRESENT_REF" | ||
"الوقت الراهن","PRESENT_REF" | ||
"الوقت الماضي","PAST_REF" | ||
"الوقت السابق","PAST_REF" | ||
"الوقت الحالي","PRESENT_REF" | ||
"هذه ال[اأ]يام","PRESENT_REF" | ||
"ال[اأ]يام ال[أا]خير[هة]","PAST_REF" | ||
|
Oops, something went wrong.