Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add functions for Tagging Schemes and Conversion. #161

Merged
merged 8 commits into from
Jun 24, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions docs/src/features.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,32 @@ julia> summarize(s, ns=2)
"This has too foo sentences."
```

## Tagging Schemes

There are many tagging schemes used for sequence labelling.
TextAnalysis currently offers functions for converting between the following tagging formats:

* BIO1
* BIO2
* BIOES

```julia
julia> tags = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"]

julia> tag_scheme!(tags, "BIO1", "BIOES")

julia> tags
8-element Array{String,1}:
"S-LOC"
"O"
"S-PER"
"B-MISC"
"E-MISC"
"B-PER"
"I-PER"
"E-PER"
```

## Parts of Speech Tagger

This tagger can be used to find the POS tag of a word or token in a given sentence. It is based on the `Average Perceptron Algorithm`.
Expand Down
2 changes: 2 additions & 0 deletions src/TextAnalysis.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ module TextAnalysis
export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles
export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags
export SentimentAnalyzer
export tag_scheme!
export jackknife_avg, listify_ngrams, weighted_lcs, fmeasure_lcs
export rouge_l_summary, rouge_l_sentence, rouge_n
export PerceptronTagger, fit!, predict
Expand Down Expand Up @@ -78,6 +79,7 @@ module TextAnalysis
include("sentiment.jl")
include("bayes.jl")
include("deprecations.jl")
include("tagging_schemes.jl")
include("utils.jl")
include("rouge.jl")
include("averagePerceptronTagger.jl")
Expand Down
146 changes: 146 additions & 0 deletions src/tagging_schemes.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# Ref:
# https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
# https://chameleonmetadata.com/Education/NLP-3/ref_nlp_encoding_schemes_list.php

# Common supertype of the tagging schemes. The concrete schemes below are
# field-less marker types: they carry no data and exist only so that the
# `tag_scheme!` conversion methods can be selected by dispatch.
abstract type tag_scheme end

# BIO1 is the scheme often referred to simply as "BIO".
struct BIO1 <: tag_scheme end

struct BIO2 <: tag_scheme end

struct BIOES <: tag_scheme end

# Names of the schemes declared above, as plain strings.
const available_schemes = String["BIO1", "BIO2", "BIOES"]

"""
tag_scheme!(tags, current_scheme::String, new_scheme::String)

Convert `tags` from `current_scheme` to `new_scheme`.

List of tagging schemes currently supported-
* BIO1 (BIO)
* BIO2
* BIOES

# Example
```julia-repl
julia> tags = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"]

julia> tag_scheme!(tags, "BIO1", "BIOES")

julia> tags
8-element Array{String,1}:
"S-LOC"
"O"
"S-PER"
"B-MISC"
"E-MISC"
"B-PER"
"I-PER"
"E-PER"
```
"""
function tag_scheme!(tags, current_scheme::String, new_scheme::String)
    # Resolve (and thereby validate) both names first, so that a misspelled
    # scheme is reported even when `tags` is empty or both names are equal.
    # Names are matched case-insensitively.
    source = _tag_scheme_from_name(current_scheme)
    target = _tag_scheme_from_name(new_scheme)

    # Nothing to convert; there is also no method for identical schemes.
    (isempty(tags) || source === target) && return nothing

    return tag_scheme!(tags, source, target)
end

# Map a scheme name onto the corresponding scheme object. An explicit lookup
# replaces `eval(Symbol(name))()`, which evaluated code in the module's global
# scope on every call. Throws an `ErrorException` for unknown names.
function _tag_scheme_from_name(name::AbstractString)
    key = uppercase(name)
    key == "BIO1" && return BIO1()
    key == "BIO2" && return BIO2()
    key == "BIOES" && return BIOES()
    error("Invalid tagging scheme")
end

# Convert `tags` in place from BIO1 to BIO2.
#
# Every `I-` tag that opens an entity (first position, after an outside tag,
# or after a tag of a different type) is rewritten to `B-`. Existing `B-` tags
# are kept. Any tag starting with 'O' is normalised to "O".
# Throws an `ErrorException("Invalid tags")` for an empty tag or a tag whose
# first character is not one of 'B', 'I', 'O'. Returns `nothing`.
function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIO2)
    for i in eachindex(tags)
        tag = tags[i]
        isempty(tag) && error("Invalid tags")
        prefix = first(tag)

        if prefix == 'O'
            # Compare the first character against a Char: a String never
            # equals a Char, so the normalisation must not mix the two.
            tags[i] = "O"
        elseif prefix == 'I'
            suffix = tag[2:end]
            # Earlier tags were already rewritten, but only their prefix
            # letter changes, so comparing type suffixes is still valid.
            opens_entity = i == firstindex(tags) ||
                           tags[i - 1] == "O" ||
                           tags[i - 1][2:end] != suffix
            if opens_entity
                tags[i] = 'B' * suffix
            end
        elseif prefix != 'B'
            error("Invalid tags")
        end
    end
    return nothing
end

# Convert `tags` in place from BIO2 to BIO1.
#
# A `B-` tag is rewritten to `I-` when it is the last tag or the following tag
# is outside / of a different type, unless the preceding tag belongs to an
# entity of the same type. In that last case the `B-` is the only thing that
# separates two adjacent entities, so it has to stay. A `B-` tag followed by a
# tag of the same type is left unchanged. Any tag starting with 'O' is
# normalised to "O".
# Throws an `ErrorException("Invalid tags")` for an empty tag or a tag whose
# first character is not one of 'B', 'I', 'O'. Returns `nothing`.
function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIO1)
    for i in eachindex(tags)
        tag = tags[i]
        isempty(tag) && error("Invalid tags")
        prefix = first(tag)

        if prefix == 'O'
            tags[i] = "O"
        elseif prefix == 'B'
            suffix = tag[2:end]

            # The previous tag was already processed; only its prefix letter
            # can have changed, so its type suffix is still the original one.
            follows_same_type = i != firstindex(tags) &&
                                tags[i - 1] != "O" &&
                                tags[i - 1][2:end] == suffix

            next_differs = i == lastindex(tags) ||
                           tags[i + 1] == "O" ||
                           tags[i + 1][2:end] != suffix

            if next_differs && !follows_same_type
                tags[i] = 'I' * suffix
            end
        elseif prefix != 'I'
            error("Invalid tags")
        end
    end
    return nothing
end

# Convert `tags` in place from BIO2 to BIOES.
#
# An entity continues past position `i` only when the next tag is an `I-` tag
# of the same type. When it does not continue, `B-` becomes `S-` (single-token
# entity) and `I-` becomes `E-` (end of entity). Any tag starting with 'O' is
# normalised to "O".
# Throws an `ErrorException("Invalid tags")` for an empty tag or a tag whose
# first character is not one of 'B', 'I', 'O'. Returns `nothing`.
function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIOES)
    for i in eachindex(tags)
        tag = tags[i]
        isempty(tag) && error("Invalid tags")
        prefix = first(tag)

        if prefix == 'O'
            tags[i] = "O"
            continue
        end
        (prefix == 'B' || prefix == 'I') || error("Invalid tags")

        suffix = tag[2:end]

        # Look ahead at the still unconverted next tag. A following `B-` of the
        # same type starts a new entity, so comparing the suffix alone is not
        # enough to decide that the current entity goes on.
        continues = false
        if i != lastindex(tags)
            next_tag = tags[i + 1]
            continues = !isempty(next_tag) &&
                        first(next_tag) == 'I' &&
                        next_tag[2:end] == suffix
        end

        if !continues
            tags[i] = (prefix == 'B' ? 'S' : 'E') * suffix
        end
    end
    return nothing
end

# Convert `tags` in place from BIOES to BIO2.
#
# `E-` tags become `I-`, `S-` tags become `B-`, and `B-` / `I-` tags are kept.
# Any tag starting with 'O' is normalised to "O".
# Throws an `ErrorException("Invalid tags")` for an empty tag or a tag whose
# first character is not one of 'B', 'I', 'E', 'S', 'O'. Returns `nothing`.
function tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO2)
    for i in eachindex(tags)
        tag = tags[i]
        # An empty tag has no prefix to inspect; report it as invalid instead
        # of failing with a BoundsError on the first-character lookup.
        isempty(tag) && error("Invalid tags")
        prefix = first(tag)

        if prefix == 'O'
            tags[i] = "O"
        elseif prefix == 'E'
            tags[i] = 'I' * tag[2:end]
        elseif prefix == 'S'
            tags[i] = 'B' * tag[2:end]
        elseif !(prefix == 'B' || prefix == 'I')
            error("Invalid tags")
        end
    end
    return nothing
end

# BIO1 -> BIOES is done in two in-place steps, using BIO2 as the
# intermediate scheme.
function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIOES)
    intermediate = BIO2()
    tag_scheme!(tags, current_scheme, intermediate)
    return tag_scheme!(tags, intermediate, new_scheme)
end

# BIOES -> BIO1 is done in two in-place steps, using BIO2 as the
# intermediate scheme.
function tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO1)
    intermediate = BIO2()
    tag_scheme!(tags, current_scheme, intermediate)
    return tag_scheme!(tags, intermediate, new_scheme)
end
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ include("lda.jl")
include("summarizer.jl")
include("sentiment.jl")
include("bayes.jl")
include("taggingschemes.jl")
include("rouge.jl")
include("averagePerceptronTagger.jl")

Expand Down
41 changes: 41 additions & 0 deletions test/taggingschemes.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
@testset "Tagging_Schemes" begin
    # Convert a copy of `source` from scheme `from` to scheme `to` and compare
    # it with `target`; then convert back and check `source` is recovered.
    function check_roundtrip(source, target, from, to)
        working = copy(source)

        tag_scheme!(working, from, to)
        @test working == target

        tag_scheme!(working, to, from)
        @test working == source
    end

    @testset "BIO1 and BIO2" begin
        bio1 = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "I-ORG"]
        bio2 = ["B-LOC", "O", "B-PER", "B-MISC", "I-MISC", "B-ORG"]
        check_roundtrip(bio1, bio2, "BIO1", "BIO2")
    end

    @testset "BIO1 and BIOES" begin
        bio1 = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER",
                "I-PER", "I-PER"]
        bioes = ["S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER",
                 "I-PER", "E-PER"]
        check_roundtrip(bio1, bioes, "BIO1", "BIOES")
    end

    @testset "BIO2 and BIOES" begin
        bio2 = ["B-LOC", "O", "B-PER", "B-MISC", "I-MISC", "B-PER",
                "I-PER", "I-PER"]
        bioes = ["S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER",
                 "I-PER", "E-PER"]
        check_roundtrip(bio2, bioes, "BIO2", "BIOES")
    end
end